diff --git a/.circleci/config.yml b/.circleci/config.yml index f12de88b2a3..2c72a22d93a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -146,9 +146,12 @@ commands: install-maven: steps: - run: - name: Install maven + name: Install Maven command: | - sudo apt-get update -y && sudo apt-get install -y maven + wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz + tar zxf apache-maven-3.9.6-bin.tar.gz + echo "export M2_HOME=$(pwd)/apache-maven-3.9.6" >> $BASH_ENV + echo 'export PATH=$M2_HOME/bin:$PATH' >> $BASH_ENV setup-folly: steps: @@ -231,6 +234,7 @@ executors: - image: zjay437/rocksdb:0.6 linux-java-docker: docker: + # This is the Docker Image used for building RocksJava releases, see: https://github.com/evolvedbinary/docker-rocksjava - image: evolvedbinary/rocksjava:centos6_x64-be jobs: @@ -244,7 +248,7 @@ jobs: - increase-max-open-files-on-macos - install-gflags-on-macos - pre-steps-macos - - run: ulimit -S -n `ulimit -H -n` && OPT=-DCIRCLECI make V=1 J=16 -j16 all + - run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j16 all - post-steps build-macos-cmake: @@ -543,7 +547,7 @@ jobs: resource_class: large steps: - pre-steps - - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000 --use_io_uring=0' blackbox_crash_test_with_atomic_flush + - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush - post-steps build-linux-crashtest-tiered-storage-bb: @@ -553,7 +557,7 @@ jobs: - pre-steps - run: name: "run crashtest" - command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' blackbox_crash_test_with_tiered_storage + command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800' blackbox_crash_test_with_tiered_storage no_output_timeout: 100m - post-steps @@ -564,7 +568,7 @@ jobs: - pre-steps - run: name: "run crashtest" - command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' whitebox_crash_test_with_tiered_storage + command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800' whitebox_crash_test_with_tiered_storage no_output_timeout: 100m - post-steps @@ -623,7 +627,7 @@ jobs: - windows-build-steps build-linux-java: - executor: linux-docker + executor: linux-java-docker resource_class: large steps: - pre-steps @@ -636,17 +640,13 @@ jobs: which javac && javac -version - run: name: "Test RocksDBJava" - command: make V=1 J=8 -j8 jtest + command: scl enable devtoolset-7 'make V=1 J=8 -j8 jtest' - post-steps build-linux-java-pmd: - machine: - image: ubuntu-2004:202111-02 + executor: linux-java-docker resource_class: large - environment: - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64 steps: - - install-maven - pre-steps - run: name: "Set Java Environment" @@ -655,9 +655,10 @@ jobs: echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV which java && java -version which javac && javac -version + - install-maven - run: name: "PMD RocksDBJava" - command: make V=1 J=8 -j8 jpmd + command: scl enable devtoolset-7 'make V=1 J=8 -j8 jpmd' - post-pmd-steps build-linux-java-static: @@ -877,68 +878,11 @@ jobs: - perform-benchmarks - post-benchmarks -workflows: +workflows: # Only jobs that haven't been successfully migrated to GitHub Actions version: 2 - jobs-linux-run-tests: - jobs: - - build-linux - - build-linux-cmake-with-folly - - build-linux-cmake-with-folly-lite-no-test - - build-linux-gcc-7-with-folly - - build-linux-gcc-7-with-folly-lite-no-test - - build-linux-cmake-with-folly-coroutines - - build-linux-cmake-with-benchmark - - build-linux-encrypted_env-no_compression - jobs-linux-run-tests-san: - jobs: - - build-linux-clang10-asan - - build-linux-clang10-ubsan - - build-linux-clang10-mini-tsan - - build-linux-static_lib-alt_namespace-status_checked - jobs-linux-no-test-run: - jobs: - - build-linux-release - - build-linux-release-rtti - - build-examples - - build-fuzzers - - build-linux-clang-no_test_run - - build-linux-clang-13-no_test_run - - build-linux-gcc-8-no_test_run - - build-linux-gcc-10-cxx20-no_test_run - - build-linux-gcc-11-no_test_run - - build-linux-arm-cmake-no_test_run - jobs-linux-other-checks: - jobs: - - build-linux-clang10-clang-analyze - - build-linux-unity-and-headers - - build-linux-mini-crashtest - jobs-windows: - jobs: - - build-windows-vs2022-avx2 - - build-windows-vs2022 - - build-windows-vs2019 - - build-cmake-mingw - jobs-java: - jobs: - - build-linux-java - - build-linux-java-static - - build-macos-java - - build-macos-java-static - - build-macos-java-static-universal - - build-linux-java-pmd - jobs-macos: - jobs: - - build-macos - - build-macos-cmake: - run_even_tests: true - - build-macos-cmake: - run_even_tests: false jobs-linux-arm: jobs: - build-linux-arm - build-fuzzers: - jobs: - - build-fuzzers benchmark-linux: triggers: - schedule: @@ -958,9 +902,4 @@ workflows: only: - main jobs: - - build-format-compatible - build-linux-arm-test-full - - build-linux-run-microbench - - build-linux-non-shm - - build-linux-clang-13-asan-ubsan-with-folly - - build-linux-valgrind diff --git a/.github/actions/build-folly/action.yml b/.github/actions/build-folly/action.yml new file mode 100644 index 00000000000..70229199958 --- /dev/null +++ b/.github/actions/build-folly/action.yml @@ -0,0 +1,7 @@ +name: build-folly +runs: + using: composite + steps: + - name: Build folly and dependencies + run: make build_folly + shell: bash diff --git a/.github/actions/build-for-benchmarks/action.yml b/.github/actions/build-for-benchmarks/action.yml new file mode 100644 index 00000000000..20e892e8fe8 --- /dev/null +++ b/.github/actions/build-for-benchmarks/action.yml @@ -0,0 +1,8 @@ +name: build-for-benchmarks +runs: + using: composite + steps: + - uses: "./.github/actions/pre-steps" + - name: Linux build for benchmarks + run: make V=1 J=8 -j8 release + shell: bash diff --git a/.github/actions/increase-max-open-files-on-macos/action.yml b/.github/actions/increase-max-open-files-on-macos/action.yml new file mode 100644 index 00000000000..59bd5cb71d0 --- /dev/null +++ b/.github/actions/increase-max-open-files-on-macos/action.yml @@ -0,0 +1,10 @@ +name: increase-max-open-files-on-macos +runs: + using: composite + steps: + - name: Increase max open files + run: |- + sudo sysctl -w kern.maxfiles=1048576 + sudo sysctl -w kern.maxfilesperproc=1048576 + sudo launchctl limit maxfiles 1048576 + shell: bash diff --git a/.github/actions/install-gflags-on-macos/action.yml b/.github/actions/install-gflags-on-macos/action.yml new file mode 100644 index 00000000000..2a864cd35c2 --- /dev/null +++ b/.github/actions/install-gflags-on-macos/action.yml @@ -0,0 +1,7 @@ +name: install-gflags-on-macos +runs: + using: composite + steps: + - name: Install gflags on macos + run: HOMEBREW_NO_AUTO_UPDATE=1 brew install gflags + shell: bash diff --git a/.github/actions/install-gflags/action.yml b/.github/actions/install-gflags/action.yml new file mode 100644 index 00000000000..9cec707062a --- /dev/null +++ b/.github/actions/install-gflags/action.yml @@ -0,0 +1,7 @@ +name: install-gflags +runs: + using: composite + steps: + - name: Install gflags + run: sudo apt-get update -y && sudo apt-get install -y libgflags-dev + shell: bash diff --git a/.github/actions/install-jdk8-on-macos/action.yml b/.github/actions/install-jdk8-on-macos/action.yml new file mode 100644 index 00000000000..3f670bb882e --- /dev/null +++ b/.github/actions/install-jdk8-on-macos/action.yml @@ -0,0 +1,9 @@ +name: install-jdk8-on-macos +runs: + using: composite + steps: + - name: Install JDK 8 on macos + run: |- + HOMEBREW_NO_AUTO_UPDATE=1 brew tap bell-sw/liberica + HOMEBREW_NO_AUTO_UPDATE=1 brew install --cask liberica-jdk8 + shell: bash diff --git a/.github/actions/install-maven/action.yml b/.github/actions/install-maven/action.yml new file mode 100644 index 00000000000..69a925272ac --- /dev/null +++ b/.github/actions/install-maven/action.yml @@ -0,0 +1,11 @@ +name: install-maven +runs: + using: composite + steps: + - name: Install Maven + run: | + wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz + tar zxf apache-maven-3.9.6-bin.tar.gz + echo "export M2_HOME=$(pwd)/apache-maven-3.9.6" >> $GITHUB_ENV + echo "$(pwd)/apache-maven-3.9.6/bin" >> $GITHUB_PATH + shell: bash diff --git a/.github/actions/perform-benchmarks/action.yml b/.github/actions/perform-benchmarks/action.yml new file mode 100644 index 00000000000..502db796c71 --- /dev/null +++ b/.github/actions/perform-benchmarks/action.yml @@ -0,0 +1,22 @@ +name: perform-benchmarks +runs: + using: composite + steps: + - name: Test low-variance benchmarks + run: "./tools/benchmark_ci.py --db_dir ${{ runner.temp }}/rocksdb-benchmark-datadir --output_dir ${{ runner.temp }}/benchmark-results --num_keys 20000000" + env: + LD_LIBRARY_PATH: "/usr/local/lib" + DURATION_RO: 300 + DURATION_RW: 500 + NUM_THREADS: 1 + MAX_BACKGROUND_JOBS: 4 + CI_TESTS_ONLY: 'true' + WRITE_BUFFER_SIZE_MB: 16 + TARGET_FILE_SIZE_BASE_MB: 16 + MAX_BYTES_FOR_LEVEL_BASE_MB: 64 + COMPRESSION_TYPE: none + CACHE_INDEX_AND_FILTER_BLOCKS: 1 + MIN_LEVEL_TO_COMPRESS: 3 + CACHE_SIZE_MB: 10240 + MB_WRITE_PER_SEC: 2 + shell: bash diff --git a/.github/actions/post-benchmarks/action.yml b/.github/actions/post-benchmarks/action.yml new file mode 100644 index 00000000000..46d47ae2235 --- /dev/null +++ b/.github/actions/post-benchmarks/action.yml @@ -0,0 +1,17 @@ +name: post-benchmarks +runs: + using: composite + steps: + - name: Upload Benchmark Results artifact + uses: actions/upload-artifact@v4.0.0 + with: + name: benchmark-results + path: "${{ runner.temp }}/benchmark-results/**" + if-no-files-found: error + - name: Send benchmark report to visualisation + run: |- + set +e + set +o pipefail + ./build_tools/benchmark_log_tool.py --tsvfile ${{ runner.temp }}/benchmark-results/report.tsv --esdocument https://search-rocksdb-bench-k2izhptfeap2hjfxteolsgsynm.us-west-2.es.amazonaws.com/bench_test3_rix/_doc + true + shell: bash diff --git a/.github/actions/post-steps/action.yml b/.github/actions/post-steps/action.yml new file mode 100644 index 00000000000..4da4b641a1b --- /dev/null +++ b/.github/actions/post-steps/action.yml @@ -0,0 +1,38 @@ +name: post-steps +description: Steps that are taken after a RocksDB job +inputs: + artifact-prefix: + description: Prefix to append to the name of artifacts that are uploaded + required: true + default: "${{ github.job }}" +runs: + using: composite + steps: + - name: Upload Test Results artifact + uses: actions/upload-artifact@v4.0.0 + with: + name: "${{ inputs.artifact-prefix }}-test-results" + path: "${{ runner.temp }}/test-results/**" + - name: Upload DB LOG file artifact + uses: actions/upload-artifact@v4.0.0 + with: + name: "${{ inputs.artifact-prefix }}-db-log-file" + path: LOG + - name: Copy Test Logs (on Failure) + if: ${{ failure() }} + run: | + mkdir -p ${{ runner.temp }}/failure-test-logs + cp -r t/* ${{ runner.temp }}/failure-test-logs + shell: bash + - name: Upload Test Logs (on Failure) artifact + uses: actions/upload-artifact@v4.0.0 + with: + name: "${{ inputs.artifact-prefix }}-failure-test-logs" + path: ${{ runner.temp }}/failure-test-logs/** + if-no-files-found: ignore + - name: Upload Core Dumps artifact + uses: actions/upload-artifact@v4.0.0 + with: + name: "${{ inputs.artifact-prefix }}-core-dumps" + path: "core.*" + if-no-files-found: ignore diff --git a/.github/actions/pre-steps-macos/action.yml b/.github/actions/pre-steps-macos/action.yml new file mode 100644 index 00000000000..d485b610050 --- /dev/null +++ b/.github/actions/pre-steps-macos/action.yml @@ -0,0 +1,5 @@ +name: pre-steps-macos +runs: + using: composite + steps: + - uses: "./.github/actions/pre-steps" diff --git a/.github/actions/pre-steps/action.yml b/.github/actions/pre-steps/action.yml new file mode 100644 index 00000000000..1cc586ddbae --- /dev/null +++ b/.github/actions/pre-steps/action.yml @@ -0,0 +1,18 @@ +name: pre-steps +runs: + using: composite + steps: + - name: Setup Environment Variables + run: |- + echo "GTEST_THROW_ON_FAILURE=0" >> "$GITHUB_ENV" + echo "GTEST_OUTPUT=\"xml:${{ runner.temp }}/test-results/\"" >> "$GITHUB_ENV" + echo "SKIP_FORMAT_BUCK_CHECKS=1" >> "$GITHUB_ENV" + echo "GTEST_COLOR=1" >> "$GITHUB_ENV" + echo "CTEST_OUTPUT_ON_FAILURE=1" >> "$GITHUB_ENV" + echo "CTEST_TEST_TIMEOUT=300" >> "$GITHUB_ENV" + echo "ZLIB_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zlib" >> "$GITHUB_ENV" + echo "BZIP2_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/bzip2" >> "$GITHUB_ENV" + echo "SNAPPY_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/snappy" >> "$GITHUB_ENV" + echo "LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4" >> "$GITHUB_ENV" + echo "ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd" >> "$GITHUB_ENV" + shell: bash diff --git a/.github/actions/setup-folly/action.yml b/.github/actions/setup-folly/action.yml new file mode 100644 index 00000000000..41cec847ce6 --- /dev/null +++ b/.github/actions/setup-folly/action.yml @@ -0,0 +1,7 @@ +name: setup-folly +runs: + using: composite + steps: + - name: Checkout folly sources + run: make checkout_folly + shell: bash diff --git a/.github/actions/setup-upstream/action.yml b/.github/actions/setup-upstream/action.yml new file mode 100644 index 00000000000..2a409f7601c --- /dev/null +++ b/.github/actions/setup-upstream/action.yml @@ -0,0 +1,20 @@ +name: build-folly +runs: + using: composite + steps: + - name: Fix repo ownership + # Needed in some cases, as safe.directory setting doesn't take effect + # under env -i + run: chown `whoami` . || true + shell: bash + - name: Set upstream + run: git remote add upstream https://github.com/facebook/rocksdb.git + shell: bash + - name: Fetch upstream + run: git fetch upstream + shell: bash + - name: Git status + # NOTE: some old branch builds under check_format_compatible.sh invoke + # git under env -i + run: git status && git remote -v && env -i git branch + shell: bash diff --git a/.github/actions/windows-build-steps/action.yml b/.github/actions/windows-build-steps/action.yml new file mode 100644 index 00000000000..9213f2e828f --- /dev/null +++ b/.github/actions/windows-build-steps/action.yml @@ -0,0 +1,54 @@ +name: windows-build-steps +runs: + using: composite + steps: + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.3.1 + - name: Custom steps + env: + THIRDPARTY_HOME: ${{ github.workspace }}/thirdparty + CMAKE_HOME: C:/Program Files/CMake + CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe + CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe + JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8 + SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.1.8 + SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.1.8;${{ github.workspace }}/thirdparty/snappy-1.1.8/build + SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.1.8/build/Debug/snappy.lib + run: |- + # NOTE: if ... Exit $LASTEXITCODE lines needed to exit and report failure + echo ===================== Install Dependencies ===================== + choco install liberica8jdk -y + if(!$?) { Exit $LASTEXITCODE } + mkdir $Env:THIRDPARTY_HOME + cd $Env:THIRDPARTY_HOME + echo "Building Snappy dependency..." + curl -Lo snappy-1.1.8.zip https://github.com/google/snappy/archive/refs/tags/1.1.8.zip + if(!$?) { Exit $LASTEXITCODE } + unzip -q snappy-1.1.8.zip + if(!$?) { Exit $LASTEXITCODE } + cd snappy-1.1.8 + mkdir build + cd build + & cmake -G "$Env:CMAKE_GENERATOR" .. + if(!$?) { Exit $LASTEXITCODE } + msbuild Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + if(!$?) { Exit $LASTEXITCODE } + echo ======================== Build RocksDB ========================= + cd ${{ github.workspace }} + $env:Path = $env:JAVA_HOME + ";" + $env:Path + mkdir build + cd build + & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DJNI=1 .. + if(!$?) { Exit $LASTEXITCODE } + cd .. + echo "Building with VS version: $Env:CMAKE_GENERATOR" + msbuild build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + if(!$?) { Exit $LASTEXITCODE } + echo ========================= Test RocksDB ========================= + build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 + if(!$?) { Exit $LASTEXITCODE } + echo ======================== Test RocksJava ======================== + cd build\java + & ctest -C Debug -j 16 + if(!$?) { Exit $LASTEXITCODE } + shell: pwsh diff --git a/.github/workflows/benchmark-linux.yml b/.github/workflows/benchmark-linux.yml new file mode 100644 index 00000000000..ae09c01b5c1 --- /dev/null +++ b/.github/workflows/benchmark-linux.yml @@ -0,0 +1,15 @@ +name: facebook/rocksdb/benchmark-linux +on: workflow_dispatch +jobs: + # FIXME: when this job is fixed, it should be given a cron schedule like + # schedule: + # - cron: 0 * * * * + # workflow_dispatch: + benchmark-linux: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/build-for-benchmarks" + - uses: "./.github/actions/perform-benchmarks" + - uses: "./.github/actions/post-benchmarks" diff --git a/.github/workflows/nightly-candidate.yml b/.github/workflows/nightly-candidate.yml new file mode 100644 index 00000000000..28a2d3405b8 --- /dev/null +++ b/.github/workflows/nightly-candidate.yml @@ -0,0 +1,18 @@ +name: facebook/rocksdb/nightly +on: workflow_dispatch +jobs: + # These jobs would be in nightly but are failing or otherwise broken for + # some reason. + build-linux-arm-test-full: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: arm64large + container: + image: ubuntu-2004:202111-02 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/install-gflags" + - run: make V=1 J=4 -j4 check + - uses: "./.github/actions/post-steps" diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 00000000000..1749b52ba7e --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,98 @@ +name: facebook/rocksdb/nightly +on: + schedule: + - cron: 0 9 * * * + workflow_dispatch: +jobs: + build-format-compatible: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + with: + fetch-depth: 0 # Need full repo history + fetch-tags: true + - uses: "./.github/actions/setup-upstream" + - uses: "./.github/actions/pre-steps" + - name: test + run: |- + export TEST_TMPDIR=/dev/shm/rocksdb + rm -rf /dev/shm/rocksdb + mkdir /dev/shm/rocksdb + git config --global --add safe.directory /__w/rocksdb/rocksdb + tools/check_format_compatible.sh + - uses: "./.github/actions/post-steps" + build-linux-run-microbench: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: DEBUG_LEVEL=0 make -j32 run_microbench + - uses: "./.github/actions/post-steps" + build-linux-non-shm: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + env: + TEST_TMPDIR: "/tmp/rocksdb_test_tmp" + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: make V=1 -j32 check + - uses: "./.github/actions/post-steps" + build-linux-clang-13-asan-ubsan-with-folly: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/build-folly" + - run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check + - uses: "./.github/actions/post-steps" + build-linux-valgrind: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: PORTABLE=1 make V=1 -j32 valgrind_test + - uses: "./.github/actions/post-steps" + build-windows-vs2022-avx2: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: windows-2022 + env: + CMAKE_GENERATOR: Visual Studio 17 2022 + CMAKE_PORTABLE: AVX2 + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/windows-build-steps" + build-windows-vs2022: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: windows-2022 + env: + CMAKE_GENERATOR: Visual Studio 17 2022 + CMAKE_PORTABLE: 1 + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/windows-build-steps" diff --git a/.github/workflows/pr-jobs-candidate.yml b/.github/workflows/pr-jobs-candidate.yml new file mode 100644 index 00000000000..5c8e9684221 --- /dev/null +++ b/.github/workflows/pr-jobs-candidate.yml @@ -0,0 +1,46 @@ +name: facebook/rocksdb/pr-jobs-candidate +on: workflow_dispatch +jobs: + # These jobs would be in pr-jobs but are failing or otherwise broken for + # some reason. + # =========================== ARM Jobs ============================ # + build-linux-arm: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: arm64large # GitHub hosted ARM runners do not yet exist + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/install-gflags" + - run: ROCKSDBTESTS_PLATFORM_DEPENDENT=only make V=1 J=4 -j4 all_but_some_tests check_some + - uses: "./.github/actions/post-steps" + build-linux-arm-cmake-no_test_run: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: arm64large # GitHub hosted ARM runners do not yet exist + env: + JAVA_HOME: "/usr/lib/jvm/java-8-openjdk-arm64" + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/install-gflags" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - name: Build with cmake + run: |- + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=1 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. + make -j4 + - name: Build Java with cmake + run: |- + rm -rf build + mkdir build + cd build + cmake -DJNI=1 -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 .. + make -j4 rocksdb rocksdbjni + - uses: "./.github/actions/post-steps" diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml new file mode 100644 index 00000000000..00f1305a2e4 --- /dev/null +++ b/.github/workflows/pr-jobs.yml @@ -0,0 +1,609 @@ +name: facebook/rocksdb/pr-jobs +on: [push, pull_request] +jobs: + # NOTE: multiple workflows would be recommended, but the current GHA UI in + # PRs doesn't make it clear when there's an overall error with a workflow, + # making it easy to overlook something broken. Grouping everything into one + # workflow minimizes the problem because it will be suspicious if there are + # no GHA results. + # + # The if: ${{ github.repository_owner == 'facebook' }} lines prevent the + # jobs from attempting to run on repo forks, because of a few problems: + # * runs-on labels are repository (owner) specific, so the job might wait + # for days waiting for a runner that simply isn't available. + # * Pushes to branches on forks for pull requests (the normal process) would + # run the workflow jobs twice: once in the pull-from fork and once for the PR + # destination repo. This is wasteful and dumb. + # * It is not known how to avoid copy-pasting the line to each job, + # increasing the risk of misconfiguration, especially on forks that might + # want to run with this GHA setup. + # + # DEBUGGING WITH SSH: Temporarily add this as a job step, either before the + # step of interest without the "if:" line or after the failing step with the + # "if:" line. Then use ssh command printed in CI output. + # - name: Setup tmate session # TEMPORARY! + # if: ${{ failure() }} + # uses: mxschmitt/action-tmate@v3 + # with: + # limit-access-to-actor: true + + # ======================== Fast Initial Checks ====================== # + check-format-and-targets: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4.1.0 + with: + fetch-depth: 0 # Need full checkout to determine merge base + fetch-tags: true + - uses: "./.github/actions/setup-upstream" + - name: Setup Python + uses: actions/setup-python@v5 + - name: Install Dependencies + run: python -m pip install --upgrade pip + - name: Install argparse + run: pip install argparse + - name: Download clang-format-diff.py + run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py + - name: Check format + run: VERBOSE_CHECK=1 make check-format + - name: Compare buckify output + run: make check-buck-targets + - name: Simple source code checks + run: make check-sources + # ========================= Linux With Tests ======================== # + build-linux: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: make V=1 J=32 -j32 check + - uses: "./.github/actions/post-steps" + build-linux-cmake-mingw: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix + - name: Build cmake-mingw + run: |- + export PATH=$JAVA_HOME/bin:$PATH + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni + - uses: "./.github/actions/post-steps" + build-linux-cmake-with-folly: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/build-folly" + - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)" + - uses: "./.github/actions/post-steps" + build-linux-cmake-with-folly-lite-no-test: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/setup-folly" + - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)" + - uses: "./.github/actions/post-steps" + build-linux-gcc-7-with-folly: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/build-folly" + - run: USE_FOLLY=1 LIB_MODE=static CC=gcc-7 CXX=g++-7 V=1 make -j32 check + - uses: "./.github/actions/post-steps" + build-linux-gcc-7-with-folly-lite-no-test: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/setup-folly" + - run: USE_FOLLY_LITE=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 all + - uses: "./.github/actions/post-steps" + build-linux-cmake-with-folly-coroutines: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + env: + CC: gcc-10 + CXX: g++-10 + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/build-folly" + - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)" + - uses: "./.github/actions/post-steps" + build-linux-cmake-with-benchmark: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20 + - uses: "./.github/actions/post-steps" + build-linux-encrypted_env-no_compression: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: ENCRYPTED_ENV=1 ROCKSDB_DISABLE_SNAPPY=1 ROCKSDB_DISABLE_ZLIB=1 ROCKSDB_DISABLE_BZIP=1 ROCKSDB_DISABLE_LZ4=1 ROCKSDB_DISABLE_ZSTD=1 make V=1 J=32 -j32 check + - run: "./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression\n" + - uses: "./.github/actions/post-steps" + # ======================== Linux No Test Runs ======================= # + build-linux-release: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - run: make V=1 -j32 LIB_MODE=shared release + - run: ls librocksdb.so + - run: "./db_stress --version" + - run: make clean + - run: make V=1 -j32 release + - run: ls librocksdb.a + - run: "./db_stress --version" + - run: make clean + - run: apt-get remove -y libgflags-dev + - run: make V=1 -j32 LIB_MODE=shared release + - run: ls librocksdb.so + - run: if ./db_stress --version; then false; else true; fi + - run: make clean + - run: make V=1 -j32 release + - run: ls librocksdb.a + - run: if ./db_stress --version; then false; else true; fi + - uses: "./.github/actions/post-steps" + build-linux-release-rtti: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 8-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench + - run: "./db_stress --version" + - run: make clean + - run: apt-get remove -y libgflags-dev + - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench + - run: if ./db_stress --version; then false; else true; fi + build-examples: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - name: Build examples + run: make V=1 -j4 static_lib && cd examples && make V=1 -j4 + - uses: "./.github/actions/post-steps" + build-fuzzers: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - name: Build rocksdb lib + run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib + - name: Build fuzzers + run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer + - uses: "./.github/actions/post-steps" + build-linux-clang-no_test_run: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 8-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all + - uses: "./.github/actions/post-steps" + build-linux-clang-13-no_test_run: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench + - uses: "./.github/actions/post-steps" + build-linux-gcc-8-no_test_run: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: CC=gcc-8 CXX=g++-8 V=1 make -j32 all + - uses: "./.github/actions/post-steps" + build-linux-gcc-10-cxx20-no_test_run: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all + - uses: "./.github/actions/post-steps" + build-linux-gcc-11-no_test_run: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench + - uses: "./.github/actions/post-steps" + # ======================== Linux Other Checks ======================= # + build-linux-clang10-clang-analyze: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze + - uses: "./.github/actions/post-steps" + - name: compress test report + run: tar -cvzf scan_build_report.tar.gz scan_build_report + if: failure() + - uses: actions/upload-artifact@v4.0.0 + with: + name: scan-build-report + path: scan_build_report.tar.gz + build-linux-unity-and-headers: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: gcc:latest + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - run: apt-get update -y && apt-get install -y libgflags-dev + - name: Unity build + run: make V=1 -j8 unity_test + - run: make V=1 -j8 -k check-headers + - uses: "./.github/actions/post-steps" + build-linux-mini-crashtest: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush + - uses: "./.github/actions/post-steps" + # ======================= Linux with Sanitizers ===================== # + build-linux-clang10-asan: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 32-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check + - uses: "./.github/actions/post-steps" + build-linux-clang10-ubsan: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: COMPILE_WITH_UBSAN=1 OPT="-fsanitize-blacklist=.circleci/ubsan_suppression_list.txt" CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check + - uses: "./.github/actions/post-steps" + build-linux-clang10-mini-tsan: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 32-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: COMPILE_WITH_TSAN=1 CC=clang-13 CXX=clang++-13 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check + - uses: "./.github/actions/post-steps" + build-linux-static_lib-alt_namespace-status_checked: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: zjay437/rocksdb:0.6 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check + - uses: "./.github/actions/post-steps" + # ========================= MacOS build only ======================== # + build-macos: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: macos-13 + env: + ROCKSDB_DISABLE_JEMALLOC: 1 + steps: + - uses: actions/checkout@v4.1.0 + - uses: maxim-lobanov/setup-xcode@v1.6.0 + with: + xcode-version: 14.3.1 + - uses: "./.github/actions/increase-max-open-files-on-macos" + - uses: "./.github/actions/install-gflags-on-macos" + - uses: "./.github/actions/pre-steps-macos" + - name: Build + run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j16 all + - uses: "./.github/actions/post-steps" + # ========================= MacOS with Tests ======================== # + build-macos-cmake: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: macos-13 + strategy: + matrix: + run_even_tests: [true, false] + steps: + - uses: actions/checkout@v4.1.0 + - uses: maxim-lobanov/setup-xcode@v1.6.0 + with: + xcode-version: 14.3.1 + - uses: "./.github/actions/increase-max-open-files-on-macos" + - uses: "./.github/actions/install-gflags-on-macos" + - uses: "./.github/actions/pre-steps-macos" + - name: cmake generate project file + run: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. + - name: Build tests + run: cd build && make V=1 -j16 + - name: Run even tests + run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 0,,2 + if: ${{ matrix.run_even_tests }} + - name: Run odd tests + run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 1,,2 + if: ${{ ! matrix.run_even_tests }} + - uses: "./.github/actions/post-steps" + # ======================== Windows with Tests ======================= # + # NOTE: some windows jobs are in "nightly" to save resources + build-windows-vs2019: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: windows-2019 + env: + CMAKE_GENERATOR: Visual Studio 16 2019 + CMAKE_PORTABLE: 1 + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/windows-build-steps" + # ============================ Java Jobs ============================ # + build-linux-java: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: evolvedbinary/rocksjava:centos6_x64-be + options: --shm-size=16gb + steps: + # The docker image is intentionally based on an OS that has an older GLIBC version. + # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step. + - name: Checkout + env: + GH_TOKEN: ${{ github.token }} + run: | + chown `whoami` . || true + git clone --no-checkout https://oath2:$GH_TOKEN@github.com/${{ github.repository }}.git . + git -c protocol.version=2 fetch --update-head-ok --no-tags --prune --no-recurse-submodules --depth=1 origin +${{ github.sha }}:${{ github.ref }} + git checkout --progress --force ${{ github.ref }} + git log -1 --format='%H' + - uses: "./.github/actions/pre-steps" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + - name: Test RocksDBJava + run: scl enable devtoolset-7 'make V=1 J=8 -j8 jtest' + # NOTE: post-steps skipped because of compatibility issues with docker image + build-linux-java-static: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: evolvedbinary/rocksjava:centos6_x64-be + options: --shm-size=16gb + steps: + # The docker image is intentionally based on an OS that has an older GLIBC version. + # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step. + - name: Checkout + env: + GH_TOKEN: ${{ github.token }} + run: | + chown `whoami` . || true + git clone --no-checkout https://oath2:$GH_TOKEN@github.com/${{ github.repository }}.git . + git -c protocol.version=2 fetch --update-head-ok --no-tags --prune --no-recurse-submodules --depth=1 origin +${{ github.sha }}:${{ github.ref }} + git checkout --progress --force ${{ github.ref }} + git log -1 --format='%H' + - uses: "./.github/actions/pre-steps" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + - name: Build RocksDBJava Static Library + run: scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic' + # NOTE: post-steps skipped because of compatibility issues with docker image + build-macos-java: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: macos-13 + env: + JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home" + ROCKSDB_DISABLE_JEMALLOC: 1 + steps: + - uses: actions/checkout@v4.1.0 + - uses: maxim-lobanov/setup-xcode@v1.6.0 + with: + xcode-version: 14.3.1 + - uses: "./.github/actions/increase-max-open-files-on-macos" + - uses: "./.github/actions/install-gflags-on-macos" + - uses: "./.github/actions/install-jdk8-on-macos" + - uses: "./.github/actions/pre-steps-macos" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + - name: Test RocksDBJava + run: make V=1 J=16 -j16 jtest + - uses: "./.github/actions/post-steps" + build-macos-java-static: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: macos-13 + env: + JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home" + steps: + - uses: actions/checkout@v4.1.0 + - uses: maxim-lobanov/setup-xcode@v1.6.0 + with: + xcode-version: 14.3.1 + - uses: "./.github/actions/increase-max-open-files-on-macos" + - uses: "./.github/actions/install-gflags-on-macos" + - uses: "./.github/actions/install-jdk8-on-macos" + - uses: "./.github/actions/pre-steps-macos" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + - name: Build RocksDBJava x86 and ARM Static Libraries + run: make V=1 J=16 -j16 rocksdbjavastaticosx + - uses: "./.github/actions/post-steps" + build-macos-java-static-universal: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: macos-13 + env: + JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home" + steps: + - uses: actions/checkout@v4.1.0 + - uses: maxim-lobanov/setup-xcode@v1.6.0 + with: + xcode-version: 14.3.1 + - uses: "./.github/actions/increase-max-open-files-on-macos" + - uses: "./.github/actions/install-gflags-on-macos" + - uses: "./.github/actions/install-jdk8-on-macos" + - uses: "./.github/actions/pre-steps-macos" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + - name: Build RocksDBJava Universal Binary Static Library + run: make V=1 J=16 -j16 rocksdbjavastaticosx_ub + - uses: "./.github/actions/post-steps" + build-linux-java-pmd: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: evolvedbinary/rocksjava:rockylinux8_x64-be + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/install-maven" + - uses: "./.github/actions/pre-steps" + - name: Set Java Environment + run: |- + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + - name: PMD RocksDBJava + run: make V=1 J=8 -j8 jpmd + - uses: actions/upload-artifact@v4.0.0 + with: + name: pmd-report + path: "${{ github.workspace }}/java/target/pmd.xml" + - uses: actions/upload-artifact@v4.0.0 + with: + name: maven-site + path: "${{ github.workspace }}/java/target/site" diff --git a/.github/workflows/sanity_check.yml b/.github/workflows/sanity_check.yml deleted file mode 100644 index efc9d99cf37..00000000000 --- a/.github/workflows/sanity_check.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Check buck targets and code format -on: [push, pull_request] -permissions: - contents: read - -jobs: - check: - name: Check TARGETS file and code format - runs-on: ubuntu-latest - steps: - - name: Checkout feature branch - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Fetch from upstream - run: | - git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream - - - name: Where am I - run: | - echo git status && git status - echo "git remote -v" && git remote -v - echo git branch && git branch - - - name: Setup Python - uses: actions/setup-python@v1 - - - name: Install Dependencies - run: python -m pip install --upgrade pip - - - name: Install argparse - run: pip install argparse - - - name: Download clang-format-diff.py - run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py - - - name: Check format - run: VERBOSE_CHECK=1 make check-format - - - name: Compare buckify output - run: make check-buck-targets - - - name: Simple source code checks - run: make check-sources diff --git a/CMakeLists.txt b/CMakeLists.txt index b7dd3cb8092..e088e94fdc8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,29 @@ project(rocksdb HOMEPAGE_URL https://rocksdb.org/ LANGUAGES CXX C ASM) +if(APPLE) + # On macOS Cmake, when cross-compiling, sometimes CMAKE_SYSTEM_PROCESSOR wrongfully stays + # the same as CMAKE_HOST_SYSTEM_PROCESSOR regardless the target CPU. + # The manual call to set(CMAKE_SYSTEM_PROCESSOR) has to be set after the project() call. + # because project() might reset CMAKE_SYSTEM_PROCESSOR back to the value of CMAKE_HOST_SYSTEM_PROCESSOR. + # Check if CMAKE_SYSTEM_PROCESSOR is not equal to CMAKE_OSX_ARCHITECTURES + if(NOT CMAKE_OSX_ARCHITECTURES STREQUAL "") + if(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL CMAKE_OSX_ARCHITECTURES) + # Split CMAKE_OSX_ARCHITECTURES into a list + string(REPLACE ";" " " ARCH_LIST ${CMAKE_OSX_ARCHITECTURES}) + separate_arguments(ARCH_LIST UNIX_COMMAND ${ARCH_LIST}) + # Count the number of architectures + list(LENGTH ARCH_LIST ARCH_COUNT) + # Ensure that exactly one architecture is specified + if(NOT ARCH_COUNT EQUAL 1) + message(FATAL_ERROR "CMAKE_OSX_ARCHITECTURES must have exactly one value. Current value: ${CMAKE_OSX_ARCHITECTURES}") + endif() + set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_OSX_ARCHITECTURES}) + message(STATUS "CMAKE_SYSTEM_PROCESSOR is manually set to ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() +endif() + if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) endif() @@ -77,11 +100,6 @@ if (WITH_WINDOWS_UTF8_FILENAMES) endif() option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON) -if ($ENV{CIRCLECI}) - message(STATUS "Build for CircieCI env, a few tests may be disabled") - add_definitions(-DCIRCLECI) -endif() - if( NOT DEFINED CMAKE_CXX_STANDARD ) set(CMAKE_CXX_STANDARD 17) endif() @@ -172,7 +190,7 @@ else() if(WITH_ZSTD) find_package(zstd REQUIRED) add_definitions(-DZSTD) - include_directories(${ZSTD_INCLUDE_DIR}) + include_directories(${ZSTD_INCLUDE_DIRS}) list(APPEND THIRDPARTY_LIBS zstd::zstd) endif() endif() @@ -195,7 +213,7 @@ endif() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324") else() set(CMAKE_CXX_FLAGS "-W -Wextra -Wall ${CMAKE_CXX_FLAGS} -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing -Wno-invalid-offsetof") @@ -204,6 +222,7 @@ else() endif() if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj") add_definitions(-D_POSIX_C_SOURCE=1) endif() if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -384,7 +403,7 @@ option(WITH_NUMA "build with NUMA policy support" OFF) if(WITH_NUMA) find_package(NUMA REQUIRED) add_definitions(-DNUMA) - include_directories(${NUMA_INCLUDE_DIR}) + include_directories(${NUMA_INCLUDE_DIRS}) list(APPEND THIRDPARTY_LIBS NUMA::NUMA) endif() @@ -703,6 +722,7 @@ set(SOURCES db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc + db/multi_cf_iterator.cc db/output_validator.cc db/periodic_task_scheduler.cc db/range_del_aggregator.cc @@ -1377,6 +1397,7 @@ if(WITH_TESTS) db/memtable_list_test.cc db/merge_helper_test.cc db/merge_test.cc + db/multi_cf_iterator_test.cc db/options_file_test.cc db/perf_context_test.cc db/periodic_task_scheduler_test.cc @@ -1457,6 +1478,7 @@ if(WITH_TESTS) util/ribbon_test.cc util/slice_test.cc util/slice_transform_test.cc + util/string_util_test.cc util/timer_queue_test.cc util/timer_test.cc util/thread_list_test.cc @@ -1506,7 +1528,7 @@ if(WITH_TESTS) utilities/cassandra/test_utils.cc ) enable_testing() - add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) + add_custom_target(rocksdb_check COMMAND ${CMAKE_CTEST_COMMAND}) set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) target_link_libraries(${TESTUTILLIB} ${ROCKSDB_LIB} ${FOLLY_LIBS}) @@ -1531,7 +1553,7 @@ if(WITH_TESTS) target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) if(NOT "${exename}" MATCHES "db_sanity_test") gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) - add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) + add_dependencies(rocksdb_check ${exename}${ARTIFACT_SUFFIX}) endif() endforeach(sourcefile ${TESTS}) @@ -1551,7 +1573,7 @@ if(WITH_TESTS) add_executable(c_test db/c_test.c) target_link_libraries(c_test ${ROCKSDB_LIB_FOR_C} testharness) add_test(NAME c_test COMMAND c_test${ARTIFACT_SUFFIX}) - add_dependencies(check c_test) + add_dependencies(rocksdb_check c_test) endif() endif() diff --git a/HISTORY.md b/HISTORY.md index cf6b2f857dc..e2eea4da1d6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,9 +1,149 @@ # Rocksdb Change Log > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt` -## 8.9.1 (12/8/2023) +## 9.1.1 (04/17/2024) ### Bug Fixes +* Fixed Java `SstFileMetaData` to prevent throwing `java.lang.NoSuchMethodError` +* Fixed a regression when `ColumnFamilyOptions::max_successive_merges > 0` where the CPU overhead for deciding whether to merge could have increased unless the user had set the option `ColumnFamilyOptions::strict_max_successive_merges` + +## 9.1.0 (03/22/2024) +### New Features +* Added an option, `GetMergeOperandsOptions::continue_cb`, to give users the ability to end `GetMergeOperands()`'s lookup process before all merge operands were found. +* *Add sanity checks for ingesting external files that currently checks if the user key comparator used to create the file is compatible with the column family's user key comparator. +*Support ingesting external files for column family that has user-defined timestamps in memtable only enabled. +* On file systems that support storage level data checksum and reconstruction, retry SST block reads for point lookups, scans, and flush and compaction if there's a checksum mismatch on the initial read. +* Some enhancements and fixes to experimental Temperature handling features, including new `default_write_temperature` CF option and opening an `SstFileWriter` with a temperature. +* `WriteBatchWithIndex` now supports wide-column point lookups via the `GetEntityFromBatch` API. See the API comments for more details. +* *Implement experimental features: API `Iterator::GetProperty("rocksdb.iterator.write-time")` to allow users to get data's approximate write unix time and write data with a specific write time via `WriteBatch::TimedPut` API. + +### Public API Changes +* Best-effort recovery (`best_efforts_recovery == true`) may now be used together with atomic flush (`atomic_flush == true`). The all-or-nothing recovery guarantee for atomically flushed data will be upheld. +* Remove deprecated option `bottommost_temperature`, already replaced by `last_level_temperature` +* Added new PerfContext counters for block cache bytes read - block_cache_index_read_byte, block_cache_filter_read_byte, block_cache_compression_dict_read_byte, and block_cache_read_byte. +* Deprecate experimental Remote Compaction APIs - StartV2() and WaitForCompleteV2() and introduce Schedule() and Wait(). The new APIs essentially does the same thing as the old APIs. They allow taking externally generated unique id to wait for remote compaction to complete. +* *For API `WriteCommittedTransaction::GetForUpdate`, if the column family enables user-defined timestamp, it was mandated that argument `do_validate` cannot be false, and UDT based validation has to be done with a user set read timestamp. It's updated to make the UDT based validation optional if user sets `do_validate` to false and does not set a read timestamp. With this, `GetForUpdate` skips UDT based validation and it's users' responsibility to enforce the UDT invariant. SO DO NOT skip this UDT-based validation if users do not have ways to enforce the UDT invariant. Ways to enforce the invariant on the users side include manage a monotonically increasing timestamp, commit transactions in a single thread etc. +* Defined a new PerfLevel `kEnableWait` to measure time spent by user threads blocked in RocksDB other than mutex, such as a write thread waiting to be added to a write group, a write thread delayed or stalled etc. +* `RateLimiter`'s API no longer requires the burst size to be the refill size. Users of `NewGenericRateLimiter()` can now provide burst size in `single_burst_bytes`. Implementors of `RateLimiter::SetSingleBurstBytes()` need to adapt their implementations to match the changed API doc. +* Add `write_memtable_time` to the newly introduced PerfLevel `kEnableWait`. + +### Behavior Changes +* `RateLimiter`s created by `NewGenericRateLimiter()` no longer modify the refill period when `SetSingleBurstBytes()` is called. +* Merge writes will only keep merge operand count within `ColumnFamilyOptions::max_successive_merges` when the key's merge operands are all found in memory, unless `strict_max_successive_merges` is explicitly set. + +### Bug Fixes +* Fixed `kBlockCacheTier` reads to return `Status::Incomplete` when I/O is needed to fetch a merge chain's base value from a blob file. +* Fixed `kBlockCacheTier` reads to return `Status::Incomplete` on table cache miss rather than incorrectly returning an empty value. +* Fixed a data race in WalManager that may affect how frequent PurgeObsoleteWALFiles() runs. +* Re-enable the recycle_log_file_num option in DBOptions for kPointInTimeRecovery WAL recovery mode, which was previously disabled due to a bug in the recovery logic. This option is incompatible with WriteOptions::disableWAL. A Status::InvalidArgument() will be returned if disableWAL is specified. + +### Performance Improvements +* Java API `multiGet()` variants now take advantage of the underlying batched `multiGet()` performance improvements. +Before +``` +Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units +MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 64 thrpt 25 6315.541 ± 8.106 ops/s +MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 6975.468 ± 68.964 ops/s +``` +After +``` +Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units +MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 64 thrpt 25 7046.739 ± 13.299 ops/s +MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 7654.521 ± 60.121 ops/s +``` + +## 9.0.0 (02/16/2024) +### New Features +* Provide support for FSBuffer for point lookups. Also added support for scans and compactions that don't go through prefetching. +* *Make `SstFileWriter` create SST files without persisting user defined timestamps when the `Option.persist_user_defined_timestamps` flag is set to false. +* Add support for user-defined timestamps in APIs `DeleteFilesInRanges` and `GetPropertiesOfTablesInRange`. +* Mark wal\_compression feature as production-ready. Currently only compatible with ZSTD compression. + +### Public API Changes +* Allow setting Stderr logger via C API +* Declare one Get and one MultiGet variant as pure virtual, and make all the other variants non-overridable. The methods required to be implemented by derived classes of DB allow returning timestamps. It is up to the implementation to check and return an error if timestamps are not supported. The non-batched MultiGet APIs are reimplemented in terms of batched MultiGet, so callers might see a performance improvement. +* Exposed mode option to Rate Limiter via c api. +* Removed deprecated option `access_hint_on_compaction_start` +* Removed deprecated option `ColumnFamilyOptions::check_flush_compaction_key_order` +* *Remove the default `WritableFile::GetFileSize` and `FSWritableFile::GetFileSize` implementation that returns 0 and make it pure virtual, so that subclasses are enforced to explicitly provide an implementation. +* Removed deprecated option `ColumnFamilyOptions::level_compaction_dynamic_file_size` +* *Removed tickers with typos "rocksdb.error.handler.bg.errro.count", "rocksdb.error.handler.bg.io.errro.count", "rocksdb.error.handler.bg.retryable.io.errro.count". +* Remove the force mode for `EnableFileDeletions` API because it is unsafe with no known legitimate use. +* Removed deprecated option `ColumnFamilyOptions::ignore_max_compaction_bytes_for_input` +* `sst_dump --command=check` now compares the number of records in a table with `num_entries` in table property, and reports corruption if there is a mismatch. API `SstFileDumper::ReadSequential()` is updated to optionally do this verification. (#12322) + +### Behavior Changes +* format\_version=6 is the new default setting in BlockBasedTableOptions, for more robust data integrity checking. DBs and SST files written with this setting cannot be read by RocksDB versions before 8.6.0. +* Compactions can be scheduled in parallel in an additional scenario: multiple files are marked for compaction within a single column family +* For leveled compaction, RocksDB will try to do intra-L0 compaction if the total L0 size is small compared to Lbase (#12214). Users with atomic_flush=true are more likely to see the impact of this change. + +### Bug Fixes +* Fixed a data race in `DBImpl::RenameTempFileToOptionsFile`. +* Fix some perf context statistics error in write steps. which include missing write_memtable_time in unordered_write. missing write_memtable_time in PipelineWrite when Writer stat is STATE_PARALLEL_MEMTABLE_WRITER. missing write_delay_time when calling DelayWrite in WriteImplWALOnly function. +* Fixed a bug that can, under rare circumstances, cause MultiGet to return an incorrect result for a duplicate key in a MultiGet batch. +* Fix a bug where older data of an ingested key can be returned for read when universal compaction is used + +## 8.11.0 (01/19/2024) +### New Features +* Add new statistics: `rocksdb.sst.write.micros` measures time of each write to SST file; `rocksdb.file.write.{flush|compaction|db.open}.micros` measure time of each write to SST table (currently only block-based table format) and blob file for flush, compaction and db open. + +### Public API Changes +* Added another enumerator `kVerify` to enum class `FileOperationType` in listener.h. Update your `switch` statements as needed. +* Add CompressionOptions to the CompressedSecondaryCacheOptions structure to allow users to specify library specific options when creating the compressed secondary cache. +* Deprecated several options: `level_compaction_dynamic_file_size`, `ignore_max_compaction_bytes_for_input`, `check_flush_compaction_key_order`, `flush_verify_memtable_count`, `compaction_verify_record_count`, `fail_if_options_file_error`, and `enforce_single_del_contracts` +* Exposed options ttl via c api. + +### Behavior Changes +* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explictly flushing blob file. +* Files will be compacted to the next level if the data age exceeds periodic_compaction_seconds except for the last level. +* Reduced the compaction debt ratio trigger for scheduling parallel compactions +* For leveled compaction with default compaction pri (kMinOverlappingRatio), files marked for compaction will be prioritized over files not marked when picking a file from a level for compaction. + +### Bug Fixes +* Fix bug in auto_readahead_size that combined with IndexType::kBinarySearchWithFirstKey + fails or iterator lands at a wrong key +* Fixed some cases in which DB file corruption was detected but ignored on creating a backup with BackupEngine. +* Fix bugs where `rocksdb.blobdb.blob.file.synced` includes blob files failed to get synced and `rocksdb.blobdb.blob.file.bytes.written` includes blob bytes failed to get written. +* Fixed a possible memory leak or crash on a failure (such as I/O error) in automatic atomic flush of multiple column families. +* Fixed some cases of in-memory data corruption using mmap reads with `BackupEngine`, `sst_dump`, or `ldb`. +* Fixed issues with experimental `preclude_last_level_data_seconds` option that could interfere with expected data tiering. +* Fixed the handling of the edge case when all existing blob files become unreferenced. Such files are now correctly deleted. + +## 8.10.0 (12/15/2023) +### New Features +* Provide support for async_io to trim readahead_size by doing block cache lookup +* Added initial wide-column support in `WriteBatchWithIndex`. This includes the `PutEntity` API and support for wide columns in the existing read APIs (`GetFromBatch`, `GetFromBatchAndDB`, `MultiGetFromBatchAndDB`, and `BaseDeltaIterator`). + +### Public API Changes +* Custom implementations of `TablePropertiesCollectorFactory` may now return a `nullptr` collector to decline processing a file, reducing callback overheads in such cases. + +### Behavior Changes +* Make ReadOptions.auto_readahead_size default true which does prefetching optimizations for forward scans if iterate_upper_bound and block_cache is also specified. +* Compactions can be scheduled in parallel in an additional scenario: high compaction debt relative to the data size +* HyperClockCache now has built-in protection against excessive CPU consumption under the extreme stress condition of no (or very few) evictable cache entries, which can slightly increase memory usage such conditions. New option `HyperClockCacheOptions::eviction_effort_cap` controls the space-time trade-off of the response. The default should be generally well-balanced, with no measurable affect on normal operation. + +### Bug Fixes +* Fix a corner case with auto_readahead_size where Prev Operation returns NOT SUPPORTED error when scans direction is changed from forward to backward. * Avoid destroying the periodic task scheduler's default timer in order to prevent static destruction order issues. +* Fix double counting of BYTES_WRITTEN ticker when doing writes with transactions. +* Fix a WRITE_STALL counter that was reporting wrong value in few cases. +* A lookup by MultiGet in a TieredCache that goes to the local flash cache and finishes with very low latency, i.e before the subsequent call to WaitAll, is ignored, resulting in a false negative and a memory leak. + +### Performance Improvements +* Java API extensions to improve consistency and completeness of APIs +1 Extended `RocksDB.get([ColumnFamilyHandle columnFamilyHandle,] ReadOptions opt, ByteBuffer key, ByteBuffer value)` which now accepts indirect buffer parameters as well as direct buffer parameters +2 Extended `RocksDB.put( [ColumnFamilyHandle columnFamilyHandle,] WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value)` which now accepts indirect buffer parameters as well as direct buffer parameters +3 Added `RocksDB.merge([ColumnFamilyHandle columnFamilyHandle,] WriteOptions writeOptions, ByteBuffer key, ByteBuffer value)` methods with the same parameter options as `put(...)` - direct and indirect buffers are supported +4 Added `RocksIterator.key( byte[] key [, int offset, int len])` methods which retrieve the iterator key into the supplied buffer +5 Added `RocksIterator.value( byte[] value [, int offset, int len])` methods which retrieve the iterator value into the supplied buffer +6 Deprecated `get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions, byte[])` in favour of `get(final ReadOptions readOptions, final ColumnFamilyHandle columnFamilyHandle, byte[])` which has consistent parameter ordering with other methods in the same class +7 Added `Transaction.get( ReadOptions opt, [ColumnFamilyHandle columnFamilyHandle, ] byte[] key, byte[] value)` methods which retrieve the requested value into the supplied buffer +8 Added `Transaction.get( ReadOptions opt, [ColumnFamilyHandle columnFamilyHandle, ] ByteBuffer key, ByteBuffer value)` methods which retrieve the requested value into the supplied buffer +9 Added `Transaction.getForUpdate( ReadOptions readOptions, [ColumnFamilyHandle columnFamilyHandle, ] byte[] key, byte[] value, boolean exclusive [, boolean doValidate])` methods which retrieve the requested value into the supplied buffer +10 Added `Transaction.getForUpdate( ReadOptions readOptions, [ColumnFamilyHandle columnFamilyHandle, ] ByteBuffer key, ByteBuffer value, boolean exclusive [, boolean doValidate])` methods which retrieve the requested value into the supplied buffer +11 Added `Transaction.getIterator()` method as a convenience which defaults the `ReadOptions` value supplied to existing `Transaction.iterator()` methods. This mirrors the existing `RocksDB.iterator()` method. +12 Added `Transaction.put([ColumnFamilyHandle columnFamilyHandle, ] ByteBuffer key, ByteBuffer value [, boolean assumeTracked])` methods which supply the key, and the value to be written in a `ByteBuffer` parameter +13 Added `Transaction.merge([ColumnFamilyHandle columnFamilyHandle, ] ByteBuffer key, ByteBuffer value [, boolean assumeTracked])` methods which supply the key, and the value to be written/merged in a `ByteBuffer` parameter +14 Added `Transaction.mergeUntracked([ColumnFamilyHandle columnFamilyHandle, ] ByteBuffer key, ByteBuffer value)` methods which supply the key, and the value to be written/merged in a `ByteBuffer` parameter + ## 8.9.0 (11/17/2023) ### New Features @@ -359,7 +499,6 @@ For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files. * Improve RoundRobin TTL compaction, which is going to be the same as normal RoundRobin compaction to move the compaction cursor. * Fix a small CPU regression caused by a change that UserComparatorWrapper was made Customizable, because Customizable itself has small CPU overhead for initialization. -* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877). ### Behavior Changes * Sanitize min_write_buffer_number_to_merge to 1 if atomic flush is enabled to prevent unexpected data loss when WAL is disabled in a multi-column-family setting (#10773). @@ -1732,7 +1871,6 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo * Add whole key bloom filter support in memtable. * Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`. -## 5.18.2 (01/31/2019) ### Public API Change * Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped. * CompactionPri = kMinOverlappingRatio also uses compensated file size, which boosts file with lots of tombstones to be compacted first. @@ -1770,7 +1908,6 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo * Add xxhash64 checksum support * Introduced `MemoryAllocator`, which lets the user specify custom memory allocator for block based table. * Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental. -* Enabled checkpoint on readonly db (DBImplReadOnly). ### Public API Change * `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs. @@ -1787,8 +1924,6 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo * Fixed Get correctness bug in the presence of range tombstones where merge operands covered by a range tombstone always result in NotFound. * Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously. * The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files. -* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. -* Make DB ignore dropped column families while committing results of atomic flush. ## 5.17.0 (2018-10-05) ### Public API Change @@ -1954,8 +2089,6 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo * Provide lifetime hints when writing files on Linux. This reduces hardware write-amp on storage devices supporting multiple streams. * Add a DB stat, `NUMBER_ITER_SKIP`, which returns how many internal keys were skipped during iterations (e.g., due to being tombstones or duplicate versions of a key). * Add PerfContext counters, `key_lock_wait_count` and `key_lock_wait_time`, which measure the number of times transactions wait on key locks and total amount of time waiting. -* Support dynamically changing `ColumnFamilyOptions::compaction_options_universal`. -* Batch update stats at the end of each `Get`, rather than for each block cache access. ### Bug Fixes * Fix IOError on WAL write doesn't propagate to write group follower diff --git a/INSTALL.md b/INSTALL.md index fb4651e4b81..5bc5bd7b297 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -126,7 +126,6 @@ most processors made since roughly 2013. * Update XCode: run `xcode-select --install` (or install it from XCode App's settting). * Install via [homebrew](http://brew.sh/). * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. - * run `brew tap homebrew/versions; brew install gcc7 --use-llvm` to install gcc 7 (or higher). * run `brew install rocksdb` * **FreeBSD** (11.01): @@ -169,21 +168,26 @@ most processors made since roughly 2013. * Install the dependencies for RocksDB: - pkg_add gmake gflags snappy bzip2 lz4 zstd git jdk bash findutils gnuwatch + `pkg_add gmake gflags snappy bzip2 lz4 zstd git bash findutils gnuwatch` * Build RocksDB from source: + ```bash cd ~ git clone https://github.com/facebook/rocksdb.git cd rocksdb gmake static_lib + ``` * Build RocksJava from source (optional): - + * In OpenBSD, JDK depends on XWindows system, so please check that you installed OpenBSD with `xbase` package. + * Install dependencies : `pkg_add -v jdk%1.8` + ```bash cd rocksdb export JAVA_HOME=/usr/local/jdk-1.8.0 export PATH=$PATH:/usr/local/jdk-1.8.0/bin - gmake rocksdbjava + gmake rocksdbjava SHA256_CMD='sha256 -q' + ``` * **iOS**: * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define an important pre-processing macros: `IOS_CROSS_COMPILE`. diff --git a/Makefile b/Makefile index 42433fdde98..8a9c884cd50 100644 --- a/Makefile +++ b/Makefile @@ -540,7 +540,8 @@ endif ifdef USE_CLANG # Used by some teams in Facebook - WARNING_FLAGS += -Wshift-sign-overflow -Wambiguous-reversed-operator + WARNING_FLAGS += -Wshift-sign-overflow -Wambiguous-reversed-operator \ + -Wimplicit-fallthrough -Wreinterpret-base-class -Wundefined-reinterpret-cast endif ifeq ($(PLATFORM), OS_OPENBSD) @@ -1642,6 +1643,9 @@ wal_edit_test: $(OBJ_DIR)/db/wal_edit_test.o $(TEST_LIBRARY) $(LIBRARY) dbformat_test: $(OBJ_DIR)/db/dbformat_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +multi_cf_iterator_test: $(OBJ_DIR)/db/multi_cf_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + env_basic_test: $(OBJ_DIR)/env/env_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1771,6 +1775,9 @@ cuckoo_table_db_test: $(OBJ_DIR)/db/cuckoo_table_db_test.o $(TEST_LIBRARY) $(LIB listener_test: $(OBJ_DIR)/db/listener_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +string_util_test: $(OBJ_DIR)/util/string_util_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + thread_list_test: $(OBJ_DIR)/util/thread_list_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -2078,7 +2085,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE))) + ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 riscv64 sparc64 loongarch64, $(MACHINE))) ARCH := 64 else ARCH := 32 @@ -2099,7 +2106,7 @@ ifneq ($(origin JNI_LIBC), undefined) endif ifeq (,$(ROCKSDBJNILIB)) -ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE))) +ifneq (,$(filter ppc% s390x arm64 aarch64 riscv64 sparc64 loongarch64, $(MACHINE))) ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so @@ -2112,8 +2119,8 @@ ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar SHA256_CMD = sha256sum -ZLIB_VER ?= 1.3 -ZLIB_SHA256 ?= ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e +ZLIB_VER ?= 1.3.1 +ZLIB_SHA256 ?= 9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23 ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.8 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 @@ -2346,43 +2353,47 @@ rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target - docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerppc64le: mkdir -p java/target - docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerarm64v8: mkdir -p java/target - docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockers390x: mkdir -p java/target - docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + +rocksdbjavastaticdockerriscv64: + mkdir -p java/target + docker run --rm --name rocksdb_linux_riscv64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerx86musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerx86_64musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerppc64lemusl: mkdir -p java/target - docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerarm64v8musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockers390xmusl: mkdir -p java/target - docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral diff --git a/TARGETS b/TARGETS index a8f4ad0f993..77a7c16fd7e 100644 --- a/TARGETS +++ b/TARGETS @@ -100,6 +100,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/memtable_list.cc", "db/merge_helper.cc", "db/merge_operator.cc", + "db/multi_cf_iterator.cc", "db/output_validator.cc", "db/periodic_task_scheduler.cc", "db/range_del_aggregator.cc", @@ -5626,6 +5627,12 @@ cpp_unittest_wrapper(name="mock_env_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="multi_cf_iterator_test", + srcs=["db/multi_cf_iterator_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="object_registry_test", srcs=["utilities/object_registry_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index e4673613608..b56e298554b 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -163,24 +163,6 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl" - if test -z "$ROCKSDB_USE_IO_URING"; then - ROCKSDB_USE_IO_URING=1 - fi - if test "$ROCKSDB_USE_IO_URING" -ne 0; then - # check for liburing - $CXX $PLATFORM_CXXFLAGS -x c++ - -luring -o test.o 2>/dev/null < - int main() { - struct io_uring ring; - io_uring_queue_init(1, &ring, 0); - return 0; - } -EOF - if [ "$?" = 0 ]; then - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -luring" - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT" - fi - fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -614,11 +596,29 @@ EOF fi fi + if test -z "$ROCKSDB_USE_IO_URING"; then + ROCKSDB_USE_IO_URING=1 + fi + if [ "$ROCKSDB_USE_IO_URING" -ne 0 -a "$PLATFORM" = OS_LINUX ]; then + # check for liburing + $CXX $PLATFORM_CXXFLAGS -x c++ - -luring -o test.o 2>/dev/null < + int main() { + struct io_uring ring; + io_uring_queue_init(1, &ring, 0); + return 0; + } +EOF + if [ "$?" = 0 ]; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -luring" + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT" + fi + fi fi # TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning. # -Wshorten-64-to-32 breaks compilation on FreeBSD aarch64 and i386 -if ! { [ "$TARGET_OS" = FreeBSD ] && [ "$TARGET_ARCHITECTURE" = arm64 -o "$TARGET_ARCHITECTURE" = i386 ]; }; then +if ! { [ "$TARGET_OS" = FreeBSD -o "$TARGET_OS" = OpenBSD ] && [ "$TARGET_ARCHITECTURE" = arm64 -o "$TARGET_ARCHITECTURE" = i386 ]; }; then # Test whether -Wshorten-64-to-32 is available $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -Wshorten-64-to-32 2>/dev/null < 0 @@ -420,7 +425,7 @@ class CacheBench { } } - ~CacheBench() {} + ~CacheBench() = default; void PopulateCache() { Random64 rnd(FLAGS_seed); diff --git a/cache/cache_test.cc b/cache/cache_test.cc index f21efc47a92..adc354a8f14 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -106,7 +106,7 @@ class CacheTest : public testing::Test, type_ = GetParam(); } - ~CacheTest() override {} + ~CacheTest() override = default; // These functions encode/decode keys in tests cases that use // int keys. @@ -766,7 +766,9 @@ TEST_P(CacheTest, OverCapacity) { std::string key = EncodeKey(i + 1); auto h = cache.Lookup(key); ASSERT_TRUE(h != nullptr); - if (h) cache.Release(h); + if (h) { + cache.Release(h); + } } // the cache is over capacity since nothing could be evicted @@ -777,7 +779,7 @@ TEST_P(CacheTest, OverCapacity) { if (IsHyperClock()) { // Make sure eviction is triggered. - ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, &handles[0])); + ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, handles.data())); // cache is under capacity now since elements were released ASSERT_GE(n, cache.get()->GetUsage()); diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index fd330d90d83..078b922dd31 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -51,14 +52,15 @@ inline uint64_t GetInitialCountdown(Cache::Priority priority) { switch (priority) { case Cache::Priority::HIGH: return ClockHandle::kHighCountdown; - default: - assert(false); - FALLTHROUGH_INTENDED; case Cache::Priority::LOW: return ClockHandle::kLowCountdown; case Cache::Priority::BOTTOM: return ClockHandle::kBottomCountdown; } + // Switch should have been exhaustive. + assert(false); + // For release build, fall back on something reasonable. + return ClockHandle::kLowCountdown; } inline void MarkEmpty(ClockHandle& h) { @@ -93,7 +95,8 @@ inline void Unref(const ClockHandle& h, uint64_t count = 1) { (void)old_meta; } -inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) { +inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data, + bool* purgeable = nullptr) { uint64_t meta; if (purgeable) { assert(*purgeable == false); @@ -125,6 +128,7 @@ inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) { (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; if (acquire_count != release_count) { // Only clock update entries with no outstanding refs + data->seen_pinned_count++; return false; } if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && @@ -148,6 +152,8 @@ inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) { << ClockHandle::kStateShift) | (meta & ClockHandle::kHitBitMask))) { // Took ownership. + data->freed_charge += h.GetTotalCharge(); + data->freed_count += 1; return true; } else { // Compare-exchange failing probably @@ -355,6 +361,18 @@ void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin, } } +constexpr uint32_t kStrictCapacityLimitBit = 1u << 31; + +uint32_t SanitizeEncodeEecAndScl(int eviction_effort_cap, + bool strict_capacit_limit) { + eviction_effort_cap = std::max(int{1}, eviction_effort_cap); + eviction_effort_cap = + std::min(static_cast(~kStrictCapacityLimitBit), eviction_effort_cap); + uint32_t eec_and_scl = static_cast(eviction_effort_cap); + eec_and_scl |= strict_capacit_limit ? kStrictCapacityLimitBit : 0; + return eec_and_scl; +} + } // namespace void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const { @@ -384,17 +402,20 @@ HandleImpl* BaseClockTable::StandaloneInsert( template typename Table::HandleImpl* BaseClockTable::CreateStandalone( - ClockHandleBasicData& proto, size_t capacity, bool strict_capacity_limit, + ClockHandleBasicData& proto, size_t capacity, uint32_t eec_and_scl, bool allow_uncharged) { Table& derived = static_cast(*this); typename Table::InsertState state; derived.StartInsert(state); const size_t total_charge = proto.GetTotalCharge(); - if (strict_capacity_limit) { + // NOTE: we can use eec_and_scl as eviction_effort_cap below because + // strict_capacity_limit=true is supposed to disable the limit on eviction + // effort, and a large value effectively does that. + if (eec_and_scl & kStrictCapacityLimitBit) { Status s = ChargeUsageMaybeEvictStrict( total_charge, capacity, - /*need_evict_for_occupancy=*/false, state); + /*need_evict_for_occupancy=*/false, eec_and_scl, state); if (!s.ok()) { if (allow_uncharged) { proto.total_charge = 0; @@ -406,7 +427,7 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone( // Case strict_capacity_limit == false bool success = ChargeUsageMaybeEvictNonStrict
( total_charge, capacity, - /*need_evict_for_occupancy=*/false, state); + /*need_evict_for_occupancy=*/false, eec_and_scl, state); if (!success) { // Force the issue usage_.FetchAddRelaxed(total_charge); @@ -419,7 +440,7 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone( template Status BaseClockTable::ChargeUsageMaybeEvictStrict( size_t total_charge, size_t capacity, bool need_evict_for_occupancy, - typename Table::InsertState& state) { + uint32_t eviction_effort_cap, typename Table::InsertState& state) { if (total_charge > capacity) { return Status::MemoryLimit( "Cache entry too large for a single cache shard: " + @@ -444,7 +465,8 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict( } if (request_evict_charge > 0) { EvictionData data; - static_cast(this)->Evict(request_evict_charge, state, &data); + static_cast(this)->Evict(request_evict_charge, state, &data, + eviction_effort_cap); occupancy_.FetchSub(data.freed_count); if (LIKELY(data.freed_charge > need_evict_charge)) { assert(data.freed_count > 0); @@ -474,7 +496,7 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict( template inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( size_t total_charge, size_t capacity, bool need_evict_for_occupancy, - typename Table::InsertState& state) { + uint32_t eviction_effort_cap, typename Table::InsertState& state) { // For simplicity, we consider that either the cache can accept the insert // with no evictions, or we must evict enough to make (at least) enough // space. It could lead to unnecessary failures or excessive evictions in @@ -510,7 +532,8 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( } EvictionData data; if (need_evict_charge > 0) { - static_cast(this)->Evict(need_evict_charge, state, &data); + static_cast(this)->Evict(need_evict_charge, state, &data, + eviction_effort_cap); // Deal with potential occupancy deficit if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) { assert(data.freed_charge == 0); @@ -529,11 +552,7 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( return true; } -void BaseClockTable::TrackAndReleaseEvictedEntry( - ClockHandle* h, BaseClockTable::EvictionData* data) { - data->freed_charge += h->GetTotalCharge(); - data->freed_count += 1; - +void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) { bool took_value_ownership = false; if (eviction_callback_) { // For key reconstructed from hash @@ -541,7 +560,7 @@ void BaseClockTable::TrackAndReleaseEvictedEntry( took_value_ownership = eviction_callback_(ClockCacheShard::ReverseHash( h->GetHash(), &unhashed, hash_seed_), - reinterpret_cast(h), + static_cast(h), h->meta.LoadRelaxed() & ClockHandle::kHitBitMask); } if (!took_value_ownership) { @@ -550,11 +569,20 @@ void BaseClockTable::TrackAndReleaseEvictedEntry( MarkEmpty(*h); } +bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data, + uint32_t eviction_effort_cap) { + // Basically checks whether the ratio of useful effort to wasted effort is + // too low, with a start-up allowance for wasted effort before any useful + // effort. + return (data.freed_count + 1U) * uint64_t{eviction_effort_cap} <= + data.seen_pinned_count; +} + template Status BaseClockTable::Insert(const ClockHandleBasicData& proto, typename Table::HandleImpl** handle, Cache::Priority priority, size_t capacity, - bool strict_capacity_limit) { + uint32_t eec_and_scl) { using HandleImpl = typename Table::HandleImpl; Table& derived = static_cast(*this); @@ -572,9 +600,12 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto, // strict_capacity_limit, but mostly pessimistic. bool use_standalone_insert = false; const size_t total_charge = proto.GetTotalCharge(); - if (strict_capacity_limit) { + // NOTE: we can use eec_and_scl as eviction_effort_cap below because + // strict_capacity_limit=true is supposed to disable the limit on eviction + // effort, and a large value effectively does that. + if (eec_and_scl & kStrictCapacityLimitBit) { Status s = ChargeUsageMaybeEvictStrict
( - total_charge, capacity, need_evict_for_occupancy, state); + total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state); if (!s.ok()) { // Revert occupancy occupancy_.FetchSubRelaxed(1); @@ -583,7 +614,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto, } else { // Case strict_capacity_limit == false bool success = ChargeUsageMaybeEvictNonStrict
( - total_charge, capacity, need_evict_for_occupancy, state); + total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state); if (!success) { // Revert occupancy occupancy_.FetchSubRelaxed(1); @@ -687,8 +718,7 @@ void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) { #endif FixedHyperClockTable::FixedHyperClockTable( - size_t capacity, bool /*strict_capacity_limit*/, - CacheMetadataChargePolicy metadata_charge_policy, + size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const Opts& opts) @@ -1083,7 +1113,8 @@ inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) { } inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, - EvictionData* data) { + EvictionData* data, + uint32_t eviction_effort_cap) { // precondition assert(requested_charge > 0); @@ -1104,10 +1135,10 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, for (;;) { for (size_t i = 0; i < step_size; i++) { HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; - bool evicting = ClockUpdate(h); + bool evicting = ClockUpdate(h, data); if (evicting) { Rollback(h.hashed_key, &h); - TrackAndReleaseEvictedEntry(&h, data); + TrackAndReleaseEvictedEntry(&h); } } @@ -1118,6 +1149,10 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, if (old_clock_pointer >= max_clock_pointer) { return; } + if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) { + eviction_effort_exceeded_count_.FetchAddRelaxed(1); + return; + } // Advance clock pointer (concurrently) old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size); @@ -1132,10 +1167,11 @@ ClockCacheShard
::ClockCacheShard( const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const typename Table::Opts& opts) : CacheShardBase(metadata_charge_policy), - table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator, - eviction_callback, hash_seed, opts), + table_(capacity, metadata_charge_policy, allocator, eviction_callback, + hash_seed, opts), capacity_(capacity), - strict_capacity_limit_(strict_capacity_limit) { + eec_and_scl_(SanitizeEncodeEecAndScl(opts.eviction_effort_cap, + strict_capacity_limit)) { // Initial charge metadata should not exceed capacity assert(table_.GetUsage() <= capacity_.LoadRelaxed() || capacity_.LoadRelaxed() < sizeof(HandleImpl)); @@ -1211,7 +1247,11 @@ void ClockCacheShard
::SetCapacity(size_t capacity) { template void ClockCacheShard
::SetStrictCapacityLimit( bool strict_capacity_limit) { - strict_capacity_limit_.StoreRelaxed(strict_capacity_limit); + if (strict_capacity_limit) { + eec_and_scl_.FetchOrRelaxed(kStrictCapacityLimitBit); + } else { + eec_and_scl_.FetchAndRelaxed(~kStrictCapacityLimitBit); + } // next Insert will take care of any necessary evictions } @@ -1233,7 +1273,7 @@ Status ClockCacheShard
::Insert(const Slice& key, proto.total_charge = charge; return table_.template Insert
(proto, handle, priority, capacity_.LoadRelaxed(), - strict_capacity_limit_.LoadRelaxed()); + eec_and_scl_.LoadRelaxed()); } template @@ -1248,9 +1288,9 @@ typename Table::HandleImpl* ClockCacheShard
::CreateStandalone( proto.value = obj; proto.helper = helper; proto.total_charge = charge; - return table_.template CreateStandalone
( - proto, capacity_.LoadRelaxed(), strict_capacity_limit_.LoadRelaxed(), - allow_uncharged); + return table_.template CreateStandalone
(proto, capacity_.LoadRelaxed(), + eec_and_scl_.LoadRelaxed(), + allow_uncharged); } template @@ -1388,19 +1428,19 @@ BaseHyperClockCache
::BaseHyperClockCache( template Cache::ObjectPtr BaseHyperClockCache
::Value(Handle* handle) { - return reinterpret_cast(handle)->value; + return static_cast(handle)->value; } template size_t BaseHyperClockCache
::GetCharge(Handle* handle) const { - return reinterpret_cast(handle) + return static_cast(handle) ->GetTotalCharge(); } template const Cache::CacheItemHelper* BaseHyperClockCache
::GetCacheItemHelper( Handle* handle) const { - auto h = reinterpret_cast(handle); + auto h = static_cast(handle); return h->helper; } @@ -1502,14 +1542,20 @@ void BaseHyperClockCache
::ReportProblems( const std::shared_ptr& info_log) const { if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { LoadVarianceStats slot_stats; + uint64_t eviction_effort_exceeded_count = 0; this->ForEachShard([&](const BaseHyperClockCache
::Shard* shard) { size_t count = shard->GetTableAddressCount(); for (size_t i = 0; i < count; ++i) { slot_stats.Add(IsSlotOccupied(*shard->GetTable().HandlePtr(i))); } + eviction_effort_exceeded_count += + shard->GetTable().GetEvictionEffortExceededCount(); }); ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, "Slot occupancy stats: %s", slot_stats.Report().c_str()); + ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, + "Eviction effort exceeded: %" PRIu64, + eviction_effort_exceeded_count); } } @@ -1907,8 +1953,7 @@ class AutoHyperClockTable::ChainRewriteLock { }; AutoHyperClockTable::AutoHyperClockTable( - size_t capacity, bool /*strict_capacity_limit*/, - CacheMetadataChargePolicy metadata_charge_policy, + size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const Opts& opts) @@ -2589,7 +2634,8 @@ using ClockUpdateChainLockedOpData = template void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock, - size_t home) { + size_t home, + BaseClockTable::EvictionData* data) { constexpr bool kIsPurge = std::is_same_v; constexpr bool kIsClockUpdateChain = std::is_same_v; @@ -2631,7 +2677,7 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, assert(home == BottomNBits(h->hashed_key[1], home_shift)); if constexpr (kIsClockUpdateChain) { // Clock update and/or check for purgeable (under (de)construction) - if (ClockUpdate(*h, &purgeable)) { + if (ClockUpdate(*h, data, &purgeable)) { // Remember for finishing eviction op_data->push_back(h); // Entries for eviction become purgeable @@ -2641,6 +2687,7 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, } } else { (void)op_data; + (void)data; purgeable = ((h->meta.Load() >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) == 0; } @@ -2718,7 +2765,8 @@ using PurgeOpData = const UniqueId64x2; using ClockUpdateChainOpData = ClockUpdateChainLockedOpData; template -void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home) { +void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home, + BaseClockTable::EvictionData* data) { // Early efforts to make AutoHCC fully wait-free ran into too many problems // that needed obscure and potentially inefficient work-arounds to have a // chance at working. @@ -2799,9 +2847,9 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home) { if (!rewrite_lock.IsEnd()) { if constexpr (kIsPurge) { PurgeLockedOpData* locked_op_data{}; - PurgeImplLocked(locked_op_data, rewrite_lock, home); + PurgeImplLocked(locked_op_data, rewrite_lock, home, data); } else { - PurgeImplLocked(op_data, rewrite_lock, home); + PurgeImplLocked(op_data, rewrite_lock, home, data); } } } @@ -3404,7 +3452,8 @@ void AutoHyperClockTable::EraseUnRefEntries() { } void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state, - EvictionData* data) { + EvictionData* data, + uint32_t eviction_effort_cap) { // precondition assert(requested_charge > 0); @@ -3462,12 +3511,12 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state, if (home >= used_length) { break; } - PurgeImpl(&to_finish_eviction, home); + PurgeImpl(&to_finish_eviction, home, data); } } for (HandleImpl* h : to_finish_eviction) { - TrackAndReleaseEvictedEntry(h, data); + TrackAndReleaseEvictedEntry(h); // NOTE: setting likely_empty_slot here can cause us to reduce the // portion of "at home" entries, probably because an evicted entry // is more likely to come back than a random new entry and would be @@ -3495,6 +3544,11 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state, if (old_clock_pointer + step_size >= max_clock_pointer) { return; } + + if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) { + eviction_effort_exceeded_count_.FetchAddRelaxed(1); + return; + } } } diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 3086e7e972f..7423fa1f417 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -297,7 +298,7 @@ class ClockCacheTest; // ----------------------------------------------------------------------- // -struct ClockHandleBasicData { +struct ClockHandleBasicData : public Cache::Handle { Cache::ObjectPtr value = nullptr; const Cache::CacheItemHelper* helper = nullptr; // A lossless, reversible hash of the fixed-size (16 byte) cache key. This @@ -374,6 +375,14 @@ struct ClockHandle : public ClockHandleBasicData { class BaseClockTable { public: + struct BaseOpts { + explicit BaseOpts(int _eviction_effort_cap) + : eviction_effort_cap(_eviction_effort_cap) {} + explicit BaseOpts(const HyperClockCacheOptions& opts) + : BaseOpts(opts.eviction_effort_cap) {} + int eviction_effort_cap; + }; + BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, @@ -386,13 +395,13 @@ class BaseClockTable { template typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto, size_t capacity, - bool strict_capacity_limit, + uint32_t eec_and_scl, bool allow_uncharged); template Status Insert(const ClockHandleBasicData& proto, typename Table::HandleImpl** handle, Cache::Priority priority, - size_t capacity, bool strict_capacity_limit); + size_t capacity, uint32_t eec_and_scl); void Ref(ClockHandle& handle); @@ -406,12 +415,17 @@ class BaseClockTable { uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); } + uint64_t GetEvictionEffortExceededCount() const { + return eviction_effort_exceeded_count_.LoadRelaxed(); + } + struct EvictionData { size_t freed_charge = 0; size_t freed_count = 0; + size_t seen_pinned_count = 0; }; - void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data); + void TrackAndReleaseEvictedEntry(ClockHandle* h); #ifndef NDEBUG // Acquire N references @@ -436,6 +450,7 @@ class BaseClockTable { template Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity, bool need_evict_for_occupancy, + uint32_t eviction_effort_cap, typename Table::InsertState& state); // Helper for updating `usage_` for new entry with given `total_charge` @@ -449,6 +464,7 @@ class BaseClockTable { template bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity, bool need_evict_for_occupancy, + uint32_t eviction_effort_cap, typename Table::InsertState& state); protected: // data @@ -461,9 +477,15 @@ class BaseClockTable { RelaxedAtomic clock_pointer_{}; // Counter for number of times we yield to wait on another thread. + // It is normal for this to occur rarely in normal operation. // (Relaxed: a simple stat counter.) RelaxedAtomic yield_count_{}; + // Counter for number of times eviction effort cap is exceeded. + // It is normal for this to occur rarely in normal operation. + // (Relaxed: a simple stat counter.) + RelaxedAtomic eviction_effort_exceeded_count_{}; + // TODO: is this separation needed if we don't do background evictions? ALIGN_AS(CACHE_LINE_SIZE) // Number of elements in the table. @@ -517,17 +539,19 @@ class FixedHyperClockTable : public BaseClockTable { inline void SetStandalone() { standalone = true; } }; // struct HandleImpl - struct Opts { - explicit Opts(size_t _estimated_value_size) - : estimated_value_size(_estimated_value_size) {} - explicit Opts(const HyperClockCacheOptions& opts) { + struct Opts : public BaseOpts { + explicit Opts(size_t _estimated_value_size, int _eviction_effort_cap) + : BaseOpts(_eviction_effort_cap), + estimated_value_size(_estimated_value_size) {} + explicit Opts(const HyperClockCacheOptions& opts) + : BaseOpts(opts.eviction_effort_cap) { assert(opts.estimated_entry_charge > 0); estimated_value_size = opts.estimated_entry_charge; } size_t estimated_value_size; }; - FixedHyperClockTable(size_t capacity, bool strict_capacity_limit, + FixedHyperClockTable(size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, @@ -549,7 +573,8 @@ class FixedHyperClockTable : public BaseClockTable { // Runs the clock eviction algorithm trying to reclaim at least // requested_charge. Returns how much is evicted, which could be less // if it appears impossible to evict the requested amount without blocking. - void Evict(size_t requested_charge, InsertState& state, EvictionData* data); + void Evict(size_t requested_charge, InsertState& state, EvictionData* data, + uint32_t eviction_effort_cap); HandleImpl* Lookup(const UniqueId64x2& hashed_key); @@ -803,18 +828,20 @@ class AutoHyperClockTable : public BaseClockTable { } }; // struct HandleImpl - struct Opts { - explicit Opts(size_t _min_avg_value_size) - : min_avg_value_size(_min_avg_value_size) {} + struct Opts : public BaseOpts { + explicit Opts(size_t _min_avg_value_size, int _eviction_effort_cap) + : BaseOpts(_eviction_effort_cap), + min_avg_value_size(_min_avg_value_size) {} - explicit Opts(const HyperClockCacheOptions& opts) { + explicit Opts(const HyperClockCacheOptions& opts) + : BaseOpts(opts.eviction_effort_cap) { assert(opts.estimated_entry_charge == 0); min_avg_value_size = opts.min_avg_entry_charge; } size_t min_avg_value_size; }; - AutoHyperClockTable(size_t capacity, bool strict_capacity_limit, + AutoHyperClockTable(size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, @@ -841,7 +868,8 @@ class AutoHyperClockTable : public BaseClockTable { // Runs the clock eviction algorithm trying to reclaim at least // requested_charge. Returns how much is evicted, which could be less // if it appears impossible to evict the requested amount without blocking. - void Evict(size_t requested_charge, InsertState& state, EvictionData* data); + void Evict(size_t requested_charge, InsertState& state, EvictionData* data, + uint32_t eviction_effort_cap); HandleImpl* Lookup(const UniqueId64x2& hashed_key); @@ -906,7 +934,8 @@ class AutoHyperClockTable : public BaseClockTable { // with proper handling to ensure all existing data is seen even in the // presence of concurrent insertions, etc. (See implementation.) template - void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX); + void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX, + EvictionData* data = nullptr); // An RAII wrapper for locking a chain of entries for removals. See // implementation. @@ -916,7 +945,7 @@ class AutoHyperClockTable : public BaseClockTable { // implementation. template void PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock, - size_t home); + size_t home, EvictionData* data); // Update length_info_ as much as possible without waiting, given a known // usable (ready for inserts and lookups) grow_home. (Previous grow_homes @@ -1078,9 +1107,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { // (Relaxed: eventual consistency/update is OK) RelaxedAtomic capacity_; - // Whether to reject insertion if cache reaches its full capacity. + // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit + // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc. // (Relaxed: eventual consistency/update is OK) - RelaxedAtomic strict_capacity_limit_; + RelaxedAtomic eec_and_scl_; }; // class ClockCacheShard template diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc index b29670b7730..ef2417f8d81 100644 --- a/cache/compressed_secondary_cache.cc +++ b/cache/compressed_secondary_cache.cc @@ -26,12 +26,12 @@ CompressedSecondaryCache::CompressedSecondaryCache( cache_))), disable_cache_(opts.capacity == 0) {} -CompressedSecondaryCache::~CompressedSecondaryCache() {} +CompressedSecondaryCache::~CompressedSecondaryCache() = default; std::unique_ptr CompressedSecondaryCache::Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase, - bool& kept_in_sec_cache) { + Statistics* stats, bool& kept_in_sec_cache) { assert(helper); // This is a minor optimization. Its ok to skip it in TSAN in order to // avoid a false positive. @@ -51,6 +51,7 @@ std::unique_ptr CompressedSecondaryCache::Lookup( void* handle_value = cache_->Value(lru_handle); if (handle_value == nullptr) { cache_->Release(lru_handle, /*erase_if_last_ref=*/false); + RecordTick(stats, COMPRESSED_SECONDARY_CACHE_DUMMY_HITS); return nullptr; } @@ -137,6 +138,7 @@ std::unique_ptr CompressedSecondaryCache::Lookup( cache_->Release(lru_handle, /*erase_if_last_ref=*/false); } handle.reset(new CompressedSecondaryCacheResultHandle(value, charge)); + RecordTick(stats, COMPRESSED_SECONDARY_CACHE_HITS); return handle; } @@ -190,13 +192,13 @@ Status CompressedSecondaryCache::InsertInternal( type == kNoCompression && !cache_options_.do_not_compress_roles.Contains(helper->role)) { PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size); - CompressionOptions compression_opts; CompressionContext compression_context(cache_options_.compression_type, - compression_opts); + cache_options_.compression_opts); uint64_t sample_for_compression{0}; CompressionInfo compression_info( - compression_opts, compression_context, CompressionDict::GetEmptyDict(), - cache_options_.compression_type, sample_for_compression); + cache_options_.compression_opts, compression_context, + CompressionDict::GetEmptyDict(), cache_options_.compression_type, + sample_for_compression); bool success = CompressData(val, compression_info, @@ -289,6 +291,11 @@ std::string CompressedSecondaryCache::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " compression_type : %s\n", CompressionTypeToString(cache_options_.compression_type).c_str()); ret.append(buffer); + snprintf(buffer, kBufferSize, " compression_opts : %s\n", + CompressionOptionsToString( + const_cast(cache_options_.compression_opts)) + .c_str()); + ret.append(buffer); snprintf(buffer, kBufferSize, " compress_format_version : %d\n", cache_options_.compress_format_version); ret.append(buffer); @@ -377,7 +384,7 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper( chunks_head = chunks_head->next; tmp_chunk->Free(); obj = nullptr; - }; + } }}; return &kHelper; } else { diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h index 32e6fd0df9b..90e134fcf51 100644 --- a/cache/compressed_secondary_cache.h +++ b/cache/compressed_secondary_cache.h @@ -86,7 +86,7 @@ class CompressedSecondaryCache : public SecondaryCache { std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase, - bool& kept_in_sec_cache) override; + Statistics* stats, bool& kept_in_sec_cache) override; bool SupportForceErase() const override { return true; } diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc index d72680b845e..058a80dd717 100644 --- a/cache/compressed_secondary_cache_test.cc +++ b/cache/compressed_secondary_cache_test.cc @@ -33,7 +33,7 @@ const std::string key3 = "____ ____key3"; class CompressedSecondaryCacheTestBase : public testing::Test, public WithCacheType { public: - CompressedSecondaryCacheTestBase() {} + CompressedSecondaryCacheTestBase() = default; ~CompressedSecondaryCacheTestBase() override = default; protected: @@ -44,7 +44,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, // Lookup an non-existent key. std::unique_ptr handle0 = sec_cache->Lookup(key0, GetHelper(), this, true, /*advise_erase=*/true, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle0, nullptr); Random rnd(301); @@ -59,7 +59,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, std::unique_ptr handle1_1 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle1_1, nullptr); // Insert and Lookup the item k1 for the second time and advise erasing it. @@ -68,7 +68,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, std::unique_ptr handle1_2 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_NE(handle1_2, nullptr); ASSERT_FALSE(kept_in_sec_cache); if (sec_cache_is_compressed) { @@ -89,7 +89,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, // Lookup the item k1 again. std::unique_ptr handle1_3 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle1_3, nullptr); // Insert and Lookup the item k2. @@ -99,7 +99,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2); std::unique_ptr handle2_1 = sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle2_1, nullptr); ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); @@ -115,7 +115,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, } std::unique_ptr handle2_2 = sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_NE(handle2_2, nullptr); std::unique_ptr val2 = std::unique_ptr(static_cast(handle2_2->Value())); @@ -196,14 +196,14 @@ class CompressedSecondaryCacheTestBase : public testing::Test, bool kept_in_sec_cache{false}; std::unique_ptr handle1 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle1, nullptr); // Insert k2 and k1 is evicted. ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); std::unique_ptr handle2 = sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_NE(handle2, nullptr); std::unique_ptr val2 = std::unique_ptr(static_cast(handle2->Value())); @@ -215,14 +215,14 @@ class CompressedSecondaryCacheTestBase : public testing::Test, std::unique_ptr handle1_1 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle1_1, nullptr); // Create Fails. SetFailCreate(true); std::unique_ptr handle2_1 = sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/true, - kept_in_sec_cache); + /*stats=*/nullptr, kept_in_sec_cache); ASSERT_EQ(handle2_1, nullptr); // Save Fails. @@ -912,9 +912,9 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) { ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1U); bool kept_in_sec_cache{true}; - std::unique_ptr handle = - sec_cache->Lookup(ith_key, GetHelper(role), this, true, - /*advise_erase=*/true, kept_in_sec_cache); + std::unique_ptr handle = sec_cache->Lookup( + ith_key, GetHelper(role), this, true, + /*advise_erase=*/true, /*stats=*/nullptr, kept_in_sec_cache); ASSERT_NE(handle, nullptr); // Lookup returns the right data @@ -992,6 +992,8 @@ class CompressedSecCacheTestWithTiered /*_capacity=*/0, /*_estimated_entry_charge=*/256 << 10, /*_num_shard_bits=*/0); + // eviction_effort_cap setting simply to avoid churn in existing test + hcc_opts.eviction_effort_cap = 100; TieredCacheOptions opts; lru_opts.capacity = 0; lru_opts.num_shard_bits = 0; @@ -1060,7 +1062,7 @@ bool CacheUsageWithinBounds(size_t val1, size_t val2, size_t error) { TEST_P(CompressedSecCacheTestWithTiered, CacheReservationManager) { CompressedSecondaryCache* sec_cache = - reinterpret_cast(GetSecondaryCache()); + static_cast(GetSecondaryCache()); // Use EXPECT_PRED3 instead of EXPECT_NEAR to void too many size_t to // double explicit casts @@ -1083,7 +1085,7 @@ TEST_P(CompressedSecCacheTestWithTiered, CacheReservationManager) { TEST_P(CompressedSecCacheTestWithTiered, CacheReservationManagerMultipleUpdate) { CompressedSecondaryCache* sec_cache = - reinterpret_cast(GetSecondaryCache()); + static_cast(GetSecondaryCache()); EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), GetPercent(30 << 20, 1)); @@ -1169,7 +1171,7 @@ TEST_P(CompressedSecCacheTestWithTiered, AdmissionPolicy) { TEST_P(CompressedSecCacheTestWithTiered, DynamicUpdate) { CompressedSecondaryCache* sec_cache = - reinterpret_cast(GetSecondaryCache()); + static_cast(GetSecondaryCache()); std::shared_ptr tiered_cache = GetTieredCache(); // Use EXPECT_PRED3 instead of EXPECT_NEAR to void too many size_t to @@ -1233,7 +1235,7 @@ TEST_P(CompressedSecCacheTestWithTiered, DynamicUpdate) { TEST_P(CompressedSecCacheTestWithTiered, DynamicUpdateWithReservation) { CompressedSecondaryCache* sec_cache = - reinterpret_cast(GetSecondaryCache()); + static_cast(GetSecondaryCache()); std::shared_ptr tiered_cache = GetTieredCache(); ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(10 << 20)); @@ -1327,7 +1329,7 @@ TEST_P(CompressedSecCacheTestWithTiered, DynamicUpdateWithReservation) { TEST_P(CompressedSecCacheTestWithTiered, ReservationOverCapacity) { CompressedSecondaryCache* sec_cache = - reinterpret_cast(GetSecondaryCache()); + static_cast(GetSecondaryCache()); std::shared_ptr tiered_cache = GetTieredCache(); ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(110 << 20)); diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 9d169522434..79c46bcc5c0 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -277,8 +277,8 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->SetInHighPriPool(false); e->SetInLowPriPool(true); low_pri_pool_usage_ += e->total_charge; - MaintainPoolSize(); lru_low_pri_ = e; + MaintainPoolSize(); } else { // Insert "e" to the head of bottom-pri pool. e->next = lru_bottom_pri_->next; @@ -301,6 +301,7 @@ void LRUCacheShard::MaintainPoolSize() { // Overflow last entry in high-pri pool to low-pri pool. lru_low_pri_ = lru_low_pri_->next; assert(lru_low_pri_ != &lru_); + assert(lru_low_pri_->InHighPriPool()); lru_low_pri_->SetInHighPriPool(false); lru_low_pri_->SetInLowPriPool(true); assert(high_pri_pool_usage_ >= lru_low_pri_->total_charge); @@ -312,6 +313,7 @@ void LRUCacheShard::MaintainPoolSize() { // Overflow last entry in low-pri pool to bottom-pri pool. lru_bottom_pri_ = lru_bottom_pri_->next; assert(lru_bottom_pri_ != &lru_); + assert(lru_bottom_pri_->InLowPriPool()); lru_bottom_pri_->SetInHighPriPool(false); lru_bottom_pri_->SetInLowPriPool(false); assert(low_pri_pool_usage_ >= lru_bottom_pri_->total_charge); @@ -339,8 +341,7 @@ void LRUCacheShard::NotifyEvicted( MemoryAllocator* alloc = table_.GetAllocator(); for (LRUHandle* entry : evicted_handles) { if (eviction_callback_ && - eviction_callback_(entry->key(), - reinterpret_cast(entry), + eviction_callback_(entry->key(), static_cast(entry), entry->HasHit())) { // Callback took ownership of obj; just free handle free(entry); @@ -506,7 +507,7 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/, // Only call eviction callback if we're sure no one requested erasure // FIXME: disabled because of test churn if (false && was_in_cache && !erase_if_last_ref && eviction_callback_ && - eviction_callback_(e->key(), reinterpret_cast(e), + eviction_callback_(e->key(), static_cast(e), e->HasHit())) { // Callback took ownership of obj; just free handle free(e); @@ -661,18 +662,18 @@ LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) { } Cache::ObjectPtr LRUCache::Value(Handle* handle) { - auto h = reinterpret_cast(handle); + auto h = static_cast(handle); return h->value; } size_t LRUCache::GetCharge(Handle* handle) const { - return reinterpret_cast(handle)->GetCharge( + return static_cast(handle)->GetCharge( GetShard(0).metadata_charge_policy_); } const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper( Handle* handle) const { - auto h = reinterpret_cast(handle); + auto h = static_cast(handle); return h->helper; } diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 1a9ba044251..045480fbcf1 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -47,7 +47,7 @@ namespace lru_cache { // LRUCacheShard::Lookup. // While refs > 0, public properties like value and deleter must not change. -struct LRUHandle { +struct LRUHandle : public Cache::Handle { Cache::ObjectPtr value; const Cache::CacheItemHelper* helper; LRUHandle* next_hash; diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 27fd5cc854b..19b5983ab9b 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -32,7 +32,7 @@ namespace ROCKSDB_NAMESPACE { class LRUCacheTest : public testing::Test { public: - LRUCacheTest() {} + LRUCacheTest() = default; ~LRUCacheTest() override { DeleteCache(); } void DeleteCache() { @@ -47,7 +47,7 @@ class LRUCacheTest : public testing::Test { double low_pri_pool_ratio = 1.0, bool use_adaptive_mutex = kDefaultToAdaptiveMutex) { DeleteCache(); - cache_ = reinterpret_cast( + cache_ = static_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); new (cache_) LRUCacheShard(capacity, /*strict_capacity_limit=*/false, high_pri_pool_ratio, low_pri_pool_ratio, @@ -57,10 +57,11 @@ class LRUCacheTest : public testing::Test { } void Insert(const std::string& key, - Cache::Priority priority = Cache::Priority::LOW) { + Cache::Priority priority = Cache::Priority::LOW, + size_t charge = 1) { EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, - &kNoopCacheItemHelper, 1 /*charge*/, - nullptr /*handle*/, priority)); + &kNoopCacheItemHelper, charge, nullptr /*handle*/, + priority)); } void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { @@ -144,8 +145,10 @@ class LRUCacheTest : public testing::Test { ASSERT_EQ(num_bottom_pri_pool_keys, bottom_pri_pool_keys); } - private: + protected: LRUCacheShard* cache_ = nullptr; + + private: Cache::EvictionCallback eviction_callback_; }; @@ -378,7 +381,7 @@ class ClockCacheTest : public testing::Test { using Table = typename Shard::Table; using TableOpts = typename Table::Opts; - ClockCacheTest() {} + ClockCacheTest() = default; ~ClockCacheTest() override { DeleteShard(); } void DeleteShard() { @@ -389,12 +392,12 @@ class ClockCacheTest : public testing::Test { } } - void NewShard(size_t capacity, bool strict_capacity_limit = true) { + void NewShard(size_t capacity, bool strict_capacity_limit = true, + int eviction_effort_cap = 30) { DeleteShard(); - shard_ = - reinterpret_cast(port::cacheline_aligned_alloc(sizeof(Shard))); + shard_ = static_cast(port::cacheline_aligned_alloc(sizeof(Shard))); - TableOpts opts{1 /*value_size*/}; + TableOpts opts{1 /*value_size*/, eviction_effort_cap}; new (shard_) Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata, /*allocator*/ nullptr, &eviction_callback_, &hash_seed_, opts); @@ -445,12 +448,20 @@ class ClockCacheTest : public testing::Test { return Slice(reinterpret_cast(&hashed_key), 16U); } + // A bad hash function for testing / stressing collision handling static inline UniqueId64x2 TestHashedKey(char key) { // For testing hash near-collision behavior, put the variance in // hashed_key in bits that are unlikely to be used as hash bits. return {(static_cast(key) << 56) + 1234U, 5678U}; } + // A reasonable hash function, for testing "typical behavior" etc. + template + static inline UniqueId64x2 CheapHash(T i) { + return {static_cast(i) * uint64_t{0x85EBCA77C2B2AE63}, + static_cast(i) * uint64_t{0xC2B2AE3D27D4EB4F}}; + } + Shard* shard_ = nullptr; private: @@ -683,6 +694,53 @@ TYPED_TEST(ClockCacheTest, ClockEvictionTest) { } } +TYPED_TEST(ClockCacheTest, ClockEvictionEffortCapTest) { + using HandleImpl = typename ClockCacheTest::Shard::HandleImpl; + for (bool strict_capacity_limit : {true, false}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); + for (int eec : {-42, 0, 1, 10, 100, 1000}) { + SCOPED_TRACE("eviction_effort_cap = " + std::to_string(eec)); + constexpr size_t kCapacity = 1000; + // Start with much larger capacity to ensure that we can go way over + // capacity without reaching table occupancy limit. + this->NewShard(3 * kCapacity, strict_capacity_limit, eec); + auto& shard = *this->shard_; + shard.SetCapacity(kCapacity); + + // Nearly fill the cache with pinned entries, then add a bunch of + // non-pinned entries. eviction_effort_cap should affect how many + // evictable entries are present beyond the cache capacity, despite + // being evictable. + constexpr size_t kCount = kCapacity - 1; + std::unique_ptr ha { new HandleImpl* [kCount] {} }; + for (size_t i = 0; i < 2 * kCount; ++i) { + UniqueId64x2 hkey = this->CheapHash(i); + ASSERT_OK(shard.Insert( + this->TestKey(hkey), hkey, nullptr /*value*/, &kNoopCacheItemHelper, + 1 /*charge*/, i < kCount ? &ha[i] : nullptr, Cache::Priority::LOW)); + } + + if (strict_capacity_limit) { + // If strict_capacity_limit is enabled, the cache will never exceed its + // capacity + EXPECT_EQ(shard.GetOccupancyCount(), kCapacity); + } else { + // Rough inverse relationship between cap and possible memory + // explosion, which shows up as increased table occupancy count. + int effective_eec = std::max(int{1}, eec) + 1; + EXPECT_NEAR(shard.GetOccupancyCount() * 1.0, + kCount * (1 + 1.4 / effective_eec), + kCount * (0.6 / effective_eec) + 1.0); + } + + for (size_t i = 0; i < kCount; ++i) { + shard.Release(ha[i]); + } + } + } +} + namespace { struct DeleteCounter { int deleted = 0; @@ -1035,7 +1093,8 @@ class TestSecondaryCache : public SecondaryCache { std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool /*wait*/, - bool /*advise_erase*/, bool& kept_in_sec_cache) override { + bool /*advise_erase*/, Statistics* /*stats*/, + bool& kept_in_sec_cache) override { std::string key_str = key.ToString(); TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); @@ -1920,7 +1979,7 @@ TEST_P(BasicSecondaryCacheTest, BasicWaitAllTest) { ah.priority = Cache::Priority::LOW; cache->StartAsyncLookup(ah); } - cache->WaitAll(&async_handles[0], async_handles.size()); + cache->WaitAll(async_handles.data(), async_handles.size()); for (size_t i = 0; i < async_handles.size(); ++i) { SCOPED_TRACE("i = " + std::to_string(i)); Cache::Handle* result = async_handles[i].Result(); @@ -2647,6 +2706,23 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { ASSERT_OK(DestroyDB(dbname2, options)); } +TEST_F(LRUCacheTest, InsertAfterReducingCapacity) { + // Fix a bug in LRU cache where it may try to remove a low pri entry's + // charge from high pri pool. It causes + // Assertion failed: (high_pri_pool_usage_ >= lru_low_pri_->total_charge), + // function MaintainPoolSize, file lru_cache.cc + NewCache(/*capacity=*/10, /*high_pri_pool_ratio=*/0.2, + /*low_pri_pool_ratio=*/0.8); + // high pri pool size and usage are both 2 + Insert("x", Cache::Priority::HIGH); + Insert("y", Cache::Priority::HIGH); + cache_->SetCapacity(5); + // high_pri_pool_size is 1, the next time we try to maintain pool size, + // we will move entries from high pri pool to low pri pool + // The bug was deducting this entry's charge from high pri pool usage. + Insert("aaa", Cache::Priority::LOW, /*charge=*/3); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc index b36f3a381b2..6261b8ce6e7 100644 --- a/cache/secondary_cache_adapter.cc +++ b/cache/secondary_cache_adapter.cc @@ -294,7 +294,8 @@ Cache::Handle* CacheWithSecondaryAdapter::Lookup(const Slice& key, bool kept_in_sec_cache = false; std::unique_ptr secondary_handle = secondary_cache_->Lookup(key, helper, create_context, /*wait*/ true, - found_dummy_entry, /*out*/ kept_in_sec_cache); + found_dummy_entry, stats, + /*out*/ kept_in_sec_cache); if (secondary_handle) { result = Promote(std::move(secondary_handle), key, helper, priority, stats, found_dummy_entry, kept_in_sec_cache); @@ -348,10 +349,10 @@ void CacheWithSecondaryAdapter::StartAsyncLookupOnMySecondary( assert(async_handle.result_handle == nullptr); std::unique_ptr secondary_handle = - secondary_cache_->Lookup(async_handle.key, async_handle.helper, - async_handle.create_context, /*wait*/ false, - async_handle.found_dummy_entry, - /*out*/ async_handle.kept_in_sec_cache); + secondary_cache_->Lookup( + async_handle.key, async_handle.helper, async_handle.create_context, + /*wait*/ false, async_handle.found_dummy_entry, async_handle.stats, + /*out*/ async_handle.kept_in_sec_cache); if (secondary_handle) { // TODO with stacked secondaries: Check & process if already ready? async_handle.pending_handle = secondary_handle.release(); @@ -683,12 +684,14 @@ std::shared_ptr NewTieredCache(const TieredCacheOptions& _opts) { *(static_cast_with_check( opts.cache_opts)); cache_opts.capacity = opts.total_capacity; + cache_opts.secondary_cache = nullptr; cache = cache_opts.MakeSharedCache(); } else if (opts.cache_type == PrimaryCacheType::kCacheTypeHCC) { HyperClockCacheOptions cache_opts = *(static_cast_with_check( opts.cache_opts)); cache_opts.capacity = opts.total_capacity; + cache_opts.secondary_cache = nullptr; cache = cache_opts.MakeSharedCache(); } else { return nullptr; diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index b7ef723a184..6b32ba2dc90 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -139,7 +139,7 @@ class ShardedCache : public ShardedCacheBase { explicit ShardedCache(const ShardedCacheOptions& opts) : ShardedCacheBase(opts), - shards_(reinterpret_cast(port::cacheline_aligned_alloc( + shards_(static_cast(port::cacheline_aligned_alloc( sizeof(CacheShard) * GetNumShards()))), destroy_shards_in_dtor_(false) {} @@ -192,7 +192,7 @@ class ShardedCache : public ShardedCacheBase { HashVal hash = CacheShard::ComputeHash(key, hash_seed_); HandleImpl* result = GetShard(hash).CreateStandalone( key, hash, obj, helper, charge, allow_uncharged); - return reinterpret_cast(result); + return static_cast(result); } Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr, @@ -202,7 +202,7 @@ class ShardedCache : public ShardedCacheBase { HashVal hash = CacheShard::ComputeHash(key, hash_seed_); HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_context, priority, stats); - return reinterpret_cast(result); + return static_cast(result); } void Erase(const Slice& key) override { @@ -212,11 +212,11 @@ class ShardedCache : public ShardedCacheBase { bool Release(Handle* handle, bool useful, bool erase_if_last_ref = false) override { - auto h = reinterpret_cast(handle); + auto h = static_cast(handle); return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref); } bool Ref(Handle* handle) override { - auto h = reinterpret_cast(handle); + auto h = static_cast(handle); return GetShard(h->GetHash()).Ref(h); } bool Release(Handle* handle, bool erase_if_last_ref = false) override { @@ -259,7 +259,7 @@ class ShardedCache : public ShardedCacheBase { } while (remaining_work); } - virtual void EraseUnRefEntries() override { + void EraseUnRefEntries() override { ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); }); } diff --git a/cache/tiered_secondary_cache.cc b/cache/tiered_secondary_cache.cc index 493e695722b..f7d5dd91d68 100644 --- a/cache/tiered_secondary_cache.cc +++ b/cache/tiered_secondary_cache.cc @@ -5,6 +5,8 @@ #include "cache/tiered_secondary_cache.h" +#include "monitoring/statistics_impl.h" + namespace ROCKSDB_NAMESPACE { // Creation callback for use in the lookup path. It calls the upper layer @@ -29,6 +31,9 @@ Status TieredSecondaryCache::MaybeInsertAndCreate( // TODO: Don't hardcode the source context->comp_sec_cache->InsertSaved(*context->key, data, type, source) .PermitUncheckedError(); + RecordTick(context->stats, COMPRESSED_SECONDARY_CACHE_PROMOTIONS); + } else { + RecordTick(context->stats, COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS); } // Primary cache will accept the object, so call its helper to create // the object @@ -43,10 +48,10 @@ Status TieredSecondaryCache::MaybeInsertAndCreate( std::unique_ptr TieredSecondaryCache::Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& kept_in_sec_cache) { + Statistics* stats, bool& kept_in_sec_cache) { bool dummy = false; std::unique_ptr result = - target()->Lookup(key, helper, create_context, wait, advise_erase, + target()->Lookup(key, helper, create_context, wait, advise_erase, stats, /*kept_in_sec_cache=*/dummy); // We never want the item to spill back into the secondary cache kept_in_sec_cache = true; @@ -66,9 +71,10 @@ std::unique_ptr TieredSecondaryCache::Lookup( ctx.helper = helper; ctx.inner_ctx = create_context; ctx.comp_sec_cache = target(); + ctx.stats = stats; return nvm_sec_cache_->Lookup(key, outer_helper, &ctx, wait, advise_erase, - kept_in_sec_cache); + stats, kept_in_sec_cache); } // If wait is false, i.e its an async lookup, we have to allocate a result @@ -80,8 +86,10 @@ std::unique_ptr TieredSecondaryCache::Lookup( handle->ctx()->helper = helper; handle->ctx()->inner_ctx = create_context; handle->ctx()->comp_sec_cache = target(); - handle->SetInnerHandle(nvm_sec_cache_->Lookup( - key, outer_helper, handle->ctx(), wait, advise_erase, kept_in_sec_cache)); + handle->ctx()->stats = stats; + handle->SetInnerHandle( + nvm_sec_cache_->Lookup(key, outer_helper, handle->ctx(), wait, + advise_erase, stats, kept_in_sec_cache)); if (!handle->inner_handle()) { handle.reset(); } else { @@ -109,10 +117,8 @@ void TieredSecondaryCache::WaitAll( } nvm_sec_cache_->WaitAll(nvm_handles); for (auto handle : my_handles) { - assert(handle->IsReady()); - auto nvm_handle = handle->inner_handle(); - handle->SetSize(nvm_handle->Size()); - handle->SetValue(nvm_handle->Value()); + assert(handle->inner_handle()->IsReady()); + handle->Complete(); } } diff --git a/cache/tiered_secondary_cache.h b/cache/tiered_secondary_cache.h index 6e05364367c..98c6fba9db3 100644 --- a/cache/tiered_secondary_cache.h +++ b/cache/tiered_secondary_cache.h @@ -42,27 +42,25 @@ class TieredSecondaryCache : public SecondaryCacheWrapper { // This is a no-op as we currently don't allow demotion (i.e // insertion by the upper layer) of evicted blocks. - virtual Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*obj*/, - const Cache::CacheItemHelper* /*helper*/, - bool /*force_insert*/) override { + Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*obj*/, + const Cache::CacheItemHelper* /*helper*/, + bool /*force_insert*/) override { return Status::OK(); } // Warm up the nvm tier directly - virtual Status InsertSaved( - const Slice& key, const Slice& saved, - CompressionType type = CompressionType::kNoCompression, - CacheTier source = CacheTier::kVolatileTier) override { + Status InsertSaved(const Slice& key, const Slice& saved, + CompressionType type = CompressionType::kNoCompression, + CacheTier source = CacheTier::kVolatileTier) override { return nvm_sec_cache_->InsertSaved(key, saved, type, source); } - virtual std::unique_ptr Lookup( + std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& kept_in_sec_cache) override; + Statistics* stats, bool& kept_in_sec_cache) override; - virtual void WaitAll( - std::vector handles) override; + void WaitAll(std::vector handles) override; private: struct CreateContext : public Cache::CreateContext { @@ -72,6 +70,7 @@ class TieredSecondaryCache : public SecondaryCacheWrapper { Cache::CreateContext* inner_ctx; std::shared_ptr inner_handle; SecondaryCache* comp_sec_cache; + Statistics* stats; }; class ResultHandle : public SecondaryCacheResultHandle { @@ -79,7 +78,10 @@ class TieredSecondaryCache : public SecondaryCacheWrapper { ~ResultHandle() override {} bool IsReady() override { - return !inner_handle_ || inner_handle_->IsReady(); + if (inner_handle_ && inner_handle_->IsReady()) { + Complete(); + } + return ready_; } void Wait() override { @@ -92,10 +94,10 @@ class TieredSecondaryCache : public SecondaryCacheWrapper { Cache::ObjectPtr Value() override { return value_; } void Complete() { - assert(IsReady()); size_ = inner_handle_->Size(); value_ = inner_handle_->Value(); inner_handle_.reset(); + ready_ = true; } void SetInnerHandle(std::unique_ptr&& handle) { @@ -115,6 +117,7 @@ class TieredSecondaryCache : public SecondaryCacheWrapper { CreateContext ctx_; size_t size_; Cache::ObjectPtr value_; + bool ready_ = false; }; static void NoopDelete(Cache::ObjectPtr /*obj*/, diff --git a/cache/tiered_secondary_cache_test.cc b/cache/tiered_secondary_cache_test.cc index 9d8cdf7fb76..6a43b6dd526 100644 --- a/cache/tiered_secondary_cache_test.cc +++ b/cache/tiered_secondary_cache_test.cc @@ -15,10 +15,11 @@ namespace ROCKSDB_NAMESPACE { class TestSecondaryCache : public SecondaryCache { public: - explicit TestSecondaryCache(size_t capacity) + explicit TestSecondaryCache(size_t capacity, bool ready_before_wait) : cache_(NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, nullptr, kDefaultToAdaptiveMutex, kDontChargeCacheMetadata)), + ready_before_wait_(ready_before_wait), num_insert_saved_(0), num_hits_(0), num_misses_(0) {} @@ -61,7 +62,7 @@ class TestSecondaryCache : public SecondaryCache { std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool /*advise_erase*/, - bool& kept_in_sec_cache) override { + Statistics* /*stats*/, bool& kept_in_sec_cache) override { std::string key_str = key.ToString(); TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); @@ -88,7 +89,8 @@ class TestSecondaryCache : public SecondaryCache { /*alloc*/ nullptr, &value, &charge); if (s.ok()) { secondary_handle.reset(new TestSecondaryCacheResultHandle( - cache_.get(), handle, value, charge, /*ready=*/wait)); + cache_.get(), handle, value, charge, + /*ready=*/wait || ready_before_wait_)); kept_in_sec_cache = true; } else { cache_.Release(handle); @@ -168,6 +170,7 @@ class TestSecondaryCache : public SecondaryCache { BasicTypedSharedCacheInterface; using TypedHandle = SharedCache::TypedHandle; SharedCache cache_; + bool ready_before_wait_; uint32_t num_insert_saved_; uint32_t num_hits_; uint32_t num_misses_; @@ -179,11 +182,10 @@ class DBTieredSecondaryCacheTest : public DBTestBase { DBTieredSecondaryCacheTest() : DBTestBase("db_tiered_secondary_cache_test", /*env_do_fsync=*/true) {} - std::shared_ptr NewCache(size_t pri_capacity, - size_t compressed_capacity, - size_t nvm_capacity, - TieredAdmissionPolicy adm_policy = - TieredAdmissionPolicy::kAdmPolicyAuto) { + std::shared_ptr NewCache( + size_t pri_capacity, size_t compressed_capacity, size_t nvm_capacity, + TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyAuto, + bool ready_before_wait = false) { LRUCacheOptions lru_opts; TieredCacheOptions opts; lru_opts.capacity = 0; @@ -194,10 +196,11 @@ class DBTieredSecondaryCacheTest : public DBTestBase { opts.comp_cache_opts.capacity = 0; opts.comp_cache_opts.num_shard_bits = 0; opts.total_capacity = pri_capacity + compressed_capacity; - opts.compressed_secondary_ratio = + opts.compressed_secondary_ratio = compressed_secondary_ratio_ = (double)compressed_capacity / opts.total_capacity; if (nvm_capacity > 0) { - nvm_sec_cache_.reset(new TestSecondaryCache(nvm_capacity)); + nvm_sec_cache_.reset( + new TestSecondaryCache(nvm_capacity, ready_before_wait)); opts.nvm_sec_cache = nvm_sec_cache_; } opts.adm_policy = adm_policy; @@ -207,6 +210,12 @@ class DBTieredSecondaryCacheTest : public DBTestBase { return cache_; } + void ClearPrimaryCache() { + ASSERT_EQ(UpdateTieredCache(cache_, -1, 1.0), Status::OK()); + ASSERT_EQ(UpdateTieredCache(cache_, -1, compressed_secondary_ratio_), + Status::OK()); + } + TestSecondaryCache* nvm_sec_cache() { return nvm_sec_cache_.get(); } CompressedSecondaryCache* compressed_secondary_cache() { @@ -218,6 +227,7 @@ class DBTieredSecondaryCacheTest : public DBTestBase { private: std::shared_ptr cache_; std::shared_ptr nvm_sec_cache_; + double compressed_secondary_ratio_; }; // In this test, the block size is set to 4096. Each value is 1007 bytes, so @@ -376,7 +386,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(8)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 3u); @@ -390,7 +400,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(20)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -404,7 +414,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(8)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -418,7 +428,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(8)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -432,7 +442,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(8)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -446,7 +456,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(20)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -460,7 +470,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(20)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -474,7 +484,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { keys.push_back(Key(20)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -518,7 +528,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) { keys.push_back(Key(8)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 3u); @@ -532,7 +542,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) { keys.push_back(Key(20)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); @@ -551,7 +561,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) { keys.push_back(Key(36)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 10u); @@ -572,7 +582,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) { keys.push_back(Key(8)); values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); ASSERT_EQ(values.size(), keys.size()); - for (auto value : values) { + for (const auto& value : values) { ASSERT_EQ(1007, value.size()); } ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 10u); @@ -582,6 +592,116 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) { Destroy(options); } +TEST_F(DBTieredSecondaryCacheTest, ReadyBeforeWaitAllTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + table_options.block_cache = NewCache(250 * 1024, 20 * 1024, 256 * 1024, + TieredAdmissionPolicy::kAdmPolicyAuto, + /*ready_before_wait=*/true); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + + options.paranoid_file_checks = false; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + std::vector keys; + std::vector values; + + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (const auto& value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 3u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 3u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + ASSERT_EQ(options.statistics->getTickerCount(BLOCK_CACHE_MISS), 3u); + + keys.clear(); + values.clear(); + keys.push_back(Key(12)); + keys.push_back(Key(16)); + keys.push_back(Key(20)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (const auto& value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + ASSERT_EQ(options.statistics->getTickerCount(BLOCK_CACHE_MISS), 6u); + + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (const auto& value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 3u); + ASSERT_EQ(options.statistics->getTickerCount(BLOCK_CACHE_MISS), 6u); + + ClearPrimaryCache(); + + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(32)); + keys.push_back(Key(36)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (const auto& value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 4u); + ASSERT_EQ(options.statistics->getTickerCount(BLOCK_CACHE_MISS), 8u); + + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(32)); + keys.push_back(Key(36)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (const auto& value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 4u); + ASSERT_EQ(options.statistics->getTickerCount(BLOCK_CACHE_MISS), 8u); + + Destroy(options); +} + // This test is for iteration. It iterates through a set of keys in two // passes. First pass loads the compressed blocks into the nvm tier, and // the second pass should hit all of those blocks. diff --git a/cache/typed_cache.h b/cache/typed_cache.h index 125bfa0f506..5170c6048fb 100644 --- a/cache/typed_cache.h +++ b/cache/typed_cache.h @@ -155,7 +155,7 @@ class BasicTypedCacheInterface : public BaseCacheInterface, using BaseCacheInterface::BaseCacheInterface; struct TypedAsyncLookupHandle : public Cache::AsyncLookupHandle { TypedHandle* Result() { - return reinterpret_cast(Cache::AsyncLookupHandle::Result()); + return static_cast(Cache::AsyncLookupHandle::Result()); } }; @@ -169,8 +169,7 @@ class BasicTypedCacheInterface : public BaseCacheInterface, } inline TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) { - return reinterpret_cast( - this->cache_->BasicLookup(key, stats)); + return static_cast(this->cache_->BasicLookup(key, stats)); } inline void StartAsyncLookup(TypedAsyncLookupHandle& async_handle) { @@ -347,7 +346,7 @@ class FullTypedCacheInterface Priority priority = Priority::LOW, Statistics* stats = nullptr, CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) { if (lowest_used_cache_tier > CacheTier::kVolatileTier) { - return reinterpret_cast(this->cache_->Lookup( + return static_cast(this->cache_->Lookup( key, GetFullHelper(), create_context, priority, stats)); } else { return BasicTypedCacheInterface::Lookup(key, diff --git a/cloud/aws/aws_kinesis.cc b/cloud/aws/aws_kinesis.cc index 333a6dc5ec0..0ff9d11c21c 100644 --- a/cloud/aws/aws_kinesis.cc +++ b/cloud/aws/aws_kinesis.cc @@ -61,6 +61,10 @@ class KinesisWritableFile : public CloudLogWritableFile { IODebugContext* dbg) override; IOStatus Close(const IOOptions& io_opts, IODebugContext* dbg) override; IOStatus LogDelete() override; + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return 0; + } private: std::shared_ptr kinesis_client_; diff --git a/cloud/aws/aws_s3.cc b/cloud/aws/aws_s3.cc index cc9916e7d3d..ce9cda635cd 100644 --- a/cloud/aws/aws_s3.cc +++ b/cloud/aws/aws_s3.cc @@ -856,13 +856,13 @@ class WritableFileStreamBuf : public std::streambuf { : fileCloseStatus_(fileCloseStatus), fileWriter_(std::move(fileWriter)) {} ~WritableFileStreamBuf() { - *fileCloseStatus_ = fileWriter_->Close(); + *fileCloseStatus_ = fileWriter_->Close({}); } protected: // Appends a block of data to the stream. Must always write n if possible std::streamsize xsputn(const char* s, std::streamsize n) override { - auto st = fileWriter_->Append(rocksdb::Slice(s, n)); + auto st = fileWriter_->Append({}, rocksdb::Slice(s, n)); if (!st.ok()) { return EOF; } @@ -885,7 +885,7 @@ class WritableFileStreamBuf : public std::streambuf { // Flushes any buffered data int sync() override { - auto st = fileWriter_->Flush(); + auto st = fileWriter_->Flush({}); return st.ok() ? 0 : -1; } diff --git a/cloud/cloud_file_system_impl.cc b/cloud/cloud_file_system_impl.cc index d534e86c1ec..895b1daf5b7 100644 --- a/cloud/cloud_file_system_impl.cc +++ b/cloud/cloud_file_system_impl.cc @@ -1586,7 +1586,7 @@ IOStatus CloudFileSystemImpl::LoadCloudManifest(const std::string& local_dbname, // Create dummy CURRENT file to point to the dummy manifest (cloud env // will remap the filename appropriately, this is just to fool the // underyling RocksDB) - st = SetCurrentFile(GetBaseFileSystem().get(), local_dbname, + st = SetCurrentFile(WriteOptions(), GetBaseFileSystem().get(), local_dbname, 1 /* descriptor_number */, nullptr /* dir_contains_current_file */); if (!st.ok()) { @@ -2014,8 +2014,9 @@ IOStatus CloudFileSystemImpl::RollNewCookie( // MANIFEST file will be cleaned up in DeleteInvisibleFiles(). auto st = CopyFile( base_fs.get(), ManifestFileWithEpoch(local_dbname, old_epoch), - ManifestFileWithEpoch(local_dbname, delta.epoch), 0 /* size */, - true /* use_fsync */, nullptr /* io_tracer */, Temperature::kUnknown); + Temperature::kUnknown, ManifestFileWithEpoch(local_dbname, delta.epoch), + Temperature::kUnknown, 0 /* size */, true /* use_fsync */, + nullptr /* io_tracer */); if (!st.ok()) { return st; } diff --git a/cloud/cloud_manifest.cc b/cloud/cloud_manifest.cc index 010d65ed1d5..5f7ba6f01f9 100644 --- a/cloud/cloud_manifest.cc +++ b/cloud/cloud_manifest.cc @@ -148,7 +148,7 @@ IOStatus CloudManifest::WriteToLog( // 1. write header PutVarint32(&record, kCurrentFormatVersion); PutVarint32(&record, static_cast(pastEpochs_.size() + 1)); - auto status = writer.AddRecord(record); + auto status = writer.AddRecord({}, record); if (!status.ok()) { return status; } @@ -159,7 +159,7 @@ IOStatus CloudManifest::WriteToLog( PutVarint32(&record, static_cast(RecordTags::kPastEpoch)); PutLengthPrefixedSlice(&record, pe.second); PutVarint64(&record, pe.first); - status = writer.AddRecord(record); + status = writer.AddRecord({}, record); if (!status.ok()) { return status; } @@ -170,11 +170,11 @@ IOStatus CloudManifest::WriteToLog( PutVarint32(&record, static_cast(RecordTags::kCurrentEpoch)); PutLengthPrefixedSlice(&record, currentEpoch_); - status = writer.AddRecord(record); + status = writer.AddRecord({}, record); if (!status.ok()) { return status; } - return writer.file()->Sync(true); + return writer.file()->Sync({}, true); } bool CloudManifest::AddEpoch(uint64_t startFileNumber, std::string epochId) { diff --git a/cloud/db_cloud_impl.cc b/cloud/db_cloud_impl.cc index c31c17d02f1..933be96db3a 100644 --- a/cloud/db_cloud_impl.cc +++ b/cloud/db_cloud_impl.cc @@ -316,7 +316,7 @@ Status DBCloudImpl::CheckpointToCloud(const BucketOptions& destination, const CheckpointToCloudOptions& options) { DisableFileDeletions(); auto st = DoCheckpointToCloud(destination, options); - EnableFileDeletions(false); + EnableFileDeletions(); return st; } @@ -339,8 +339,8 @@ Status DBCloudImpl::DoCheckpointToCloud( auto manifest_fname = ManifestFileWithEpoch("", current_epoch); auto tmp_manifest_fname = manifest_fname + ".tmp"; st = CopyFile(local_fs.get(), GetName() + "/" + manifest_fname, - GetName() + "/" + tmp_manifest_fname, manifest_file_size, false, - nullptr, Temperature::kUnknown); + Temperature::kUnknown, GetName() + "/" + tmp_manifest_fname, + Temperature::kUnknown, manifest_file_size, false, nullptr); if (!st.ok()) { return st; } diff --git a/cloud/db_cloud_test.cc b/cloud/db_cloud_test.cc index 7faaa5028e0..0cf5cbfeac9 100644 --- a/cloud/db_cloud_test.cc +++ b/cloud/db_cloud_test.cc @@ -2551,7 +2551,7 @@ TEST_F(CloudTest, DisableObsoleteFileDeletionOnOpenTest) { // obsolete files are not deleted EXPECT_EQ(GetAllLocalFiles().size(), 10); // obsolete files are deleted! - db_->EnableFileDeletions(false /* force */); + db_->EnableFileDeletions(); EXPECT_EQ(GetAllLocalFiles().size(), 8); CloseDB(); } diff --git a/cloud/replication_test.cc b/cloud/replication_test.cc index 343b0439852..b782fa104a3 100644 --- a/cloud/replication_test.cc +++ b/cloud/replication_test.cc @@ -1146,7 +1146,7 @@ TEST_F(ReplicationTest, NoMemSwitchRecordIfEmpty) { TEST_F(ReplicationTest, EvictObsoleteFiles) { auto leader = openLeader(); - leader->EnableFileDeletions(true); + leader->EnableFileDeletions(); auto followerOptions = leaderOptions(); followerOptions.disable_delete_obsolete_files_on_open = true; auto follower = openFollower(followerOptions); diff --git a/cmake/modules/Findzstd.cmake b/cmake/modules/Findzstd.cmake index 9430821df6e..e82fa148c8c 100644 --- a/cmake/modules/Findzstd.cmake +++ b/cmake/modules/Findzstd.cmake @@ -1,29 +1,29 @@ # - Find zstd # Find the zstd compression library and includes # -# zstd_INCLUDE_DIRS - where to find zstd.h, etc. -# zstd_LIBRARIES - List of libraries when using zstd. -# zstd_FOUND - True if zstd found. +# ZSTD_INCLUDE_DIRS - where to find zstd.h, etc. +# ZSTD_LIBRARIES - List of libraries when using zstd. +# ZSTD_FOUND - True if zstd found. -find_path(zstd_INCLUDE_DIRS +find_path(ZSTD_INCLUDE_DIRS NAMES zstd.h HINTS ${zstd_ROOT_DIR}/include) -find_library(zstd_LIBRARIES +find_library(ZSTD_LIBRARIES NAMES zstd HINTS ${zstd_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS) +find_package_handle_standard_args(zstd DEFAULT_MSG ZSTD_LIBRARIES ZSTD_INCLUDE_DIRS) mark_as_advanced( - zstd_LIBRARIES - zstd_INCLUDE_DIRS) + ZSTD_LIBRARIES + ZSTD_INCLUDE_DIRS) -if(zstd_FOUND AND NOT (TARGET zstd::zstd)) +if(ZSTD_FOUND AND NOT (TARGET zstd::zstd)) add_library (zstd::zstd UNKNOWN IMPORTED) set_target_properties(zstd::zstd PROPERTIES - IMPORTED_LOCATION ${zstd_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS}) + IMPORTED_LOCATION ${ZSTD_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${ZSTD_INCLUDE_DIRS}) endif() diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index e6dcb669620..919ea566502 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -43,14 +43,13 @@ void ArenaWrappedDBIter::Init( Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { + uint64_t version_number, ReadCallback* read_callback, + ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = - new (mem) DBIter(env, read_options, ioptions, mutable_cf_options, - ioptions.user_comparator, /* iter */ nullptr, version, - sequence, true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, expose_blob_index); + db_iter_ = new (mem) DBIter( + env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator, + /* iter */ nullptr, version, sequence, true, + max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index); sv_number_ = version_number; read_options_ = read_options; allow_refresh_ = allow_refresh; @@ -65,40 +64,44 @@ void ArenaWrappedDBIter::Init( Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); } Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) { - if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { + if (cfh_ == nullptr || !allow_refresh_) { return Status::NotSupported("Creating renew iterator is not allowed."); } assert(db_iter_ != nullptr); + auto cfd = cfh_->cfd(); + auto db_impl = cfh_->db(); + // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the // correct behavior. Will be corrected automatically when we take a snapshot // here for the case of WritePreparedTxnDB. - uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); + uint64_t cur_sv_number = cfd->GetSuperVersionNumber(); // If we recreate a new internal iterator below (NewInternalIterator()), // we will pass in read_options_. We need to make sure it // has the right snapshot. read_options_.snapshot = snapshot; TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1"); TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2"); + auto reinit_internal_iter = [&]() { Env* env = db_iter_->env(); db_iter_->~DBIter(); arena_.~Arena(); new (&arena_) Arena(); - SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); + SuperVersion* sv = cfd->GetReferencedSuperVersion(db_impl); assert(sv->version_number >= cur_sv_number); - SequenceNumber read_seq = GetSeqNum(db_impl_, snapshot); + SequenceNumber read_seq = GetSeqNum(db_impl, snapshot); if (read_callback_) { read_callback_->Refresh(read_seq); } - Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + Init(env, read_options_, *(cfd->ioptions()), sv->mutable_cf_options, sv->current, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback_, db_impl_, cfd_, expose_blob_index_, + sv->version_number, read_callback_, cfh_, expose_blob_index_, allow_refresh_); - InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, read_seq, + InternalIterator* internal_iter = db_impl->NewInternalIterator( + read_options_, cfd, sv, &arena_, read_seq, /* allow_unprepared_value */ true, /* db_iter */ this); SetIterUnderDBIter(internal_iter); }; @@ -107,10 +110,10 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) { reinit_internal_iter(); break; } else { - SequenceNumber read_seq = GetSeqNum(db_impl_, snapshot); + SequenceNumber read_seq = GetSeqNum(db_impl, snapshot); // Refresh range-tombstones in MemTable if (!read_options_.ignore_range_deletions) { - SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_); + SuperVersion* sv = cfd->GetThreadLocalSuperVersion(db_impl); TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr); auto t = sv->mem->NewRangeTombstoneIterator( read_options_, read_seq, false /* immutable_memtable */); @@ -123,13 +126,13 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) { // will be freed during db_iter destruction there. if (memtable_range_tombstone_iter_) { assert(!*memtable_range_tombstone_iter_ || - sv_number_ != cfd_->GetSuperVersionNumber()); + sv_number_ != cfd->GetSuperVersionNumber()); } delete t; } else { // current mutable memtable has range tombstones if (!memtable_range_tombstone_iter_) { delete t; - db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv); + db_impl->ReturnAndCleanupSuperVersion(cfd, sv); // The memtable under DBIter did not have range tombstone before // refresh. reinit_internal_iter(); @@ -138,13 +141,13 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) { delete *memtable_range_tombstone_iter_; *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator( std::unique_ptr(t), - &cfd_->internal_comparator(), nullptr, nullptr); + &cfd->internal_comparator(), nullptr, nullptr); } } - db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv); + db_impl->ReturnAndCleanupSuperVersion(cfd, sv); } // Check again if the latest super version number is changed - uint64_t latest_sv_number = cfd_->GetSuperVersionNumber(); + uint64_t latest_sv_number = cfd->GetSuperVersionNumber(); if (latest_sv_number != cur_sv_number) { // If the super version number is changed after refreshing, // fallback to Re-Init the InternalIterator @@ -163,14 +166,14 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { + uint64_t version_number, ReadCallback* read_callback, + ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, expose_blob_index, allow_refresh); - if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index); + cfh, expose_blob_index, allow_refresh); + if (cfh != nullptr && allow_refresh) { + iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index); } return iter; diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index d30ee45c3a4..8e84c33aa3c 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -87,15 +87,14 @@ class ArenaWrappedDBIter : public Iterator { const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh); // Store some parameters so we can refresh the iterator at a later point // with these same params - void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd, + void StoreRefreshInfo(ColumnFamilyHandleImpl* cfh, ReadCallback* read_callback, bool expose_blob_index) { - db_impl_ = db_impl; - cfd_ = cfd; + cfh_ = cfh; read_callback_ = read_callback; expose_blob_index_ = expose_blob_index; } @@ -111,8 +110,7 @@ class ArenaWrappedDBIter : public Iterator { DBIter* db_iter_ = nullptr; Arena arena_; uint64_t sv_number_; - ColumnFamilyData* cfd_ = nullptr; - DBImpl* db_impl_ = nullptr; + ColumnFamilyHandleImpl* cfh_ = nullptr; ReadOptions read_options_; ReadCallback* read_callback_; bool expose_blob_index_ = false; @@ -123,13 +121,13 @@ class ArenaWrappedDBIter : public Iterator { }; // Generate the arena wrapped iterator class. -// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not +// `cfh` is used for reneweal. If left null, renewal will not // be supported. -extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( +ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, ReadCallback* read_callback, - DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, - bool expose_blob_index = false, bool allow_refresh = true); + ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false, + bool allow_refresh = true); } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 35269fdb509..dceb90cee57 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -34,9 +34,9 @@ BlobFileBuilder::BlobFileBuilder( VersionSet* versions, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, - std::string db_id, std::string db_session_id, int job_id, - uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const WriteOptions* write_options, std::string db_id, + std::string db_session_id, int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, BlobFileCreationReason creation_reason, @@ -44,18 +44,18 @@ BlobFileBuilder::BlobFileBuilder( std::vector* blob_file_additions) : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, immutable_options, mutable_cf_options, file_options, - db_id, db_session_id, job_id, column_family_id, - column_family_name, io_priority, write_hint, io_tracer, - blob_callback, creation_reason, blob_file_paths, - blob_file_additions) {} + write_options, db_id, db_session_id, job_id, + column_family_id, column_family_name, write_hint, + io_tracer, blob_callback, creation_reason, + blob_file_paths, blob_file_additions) {} BlobFileBuilder::BlobFileBuilder( std::function file_number_generator, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, - std::string db_id, std::string db_session_id, int job_id, - uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const WriteOptions* write_options, std::string db_id, + std::string db_session_id, int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, BlobFileCreationReason creation_reason, @@ -69,12 +69,12 @@ BlobFileBuilder::BlobFileBuilder( blob_compression_type_(mutable_cf_options->blob_compression_type), prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache), file_options_(file_options), + write_options_(write_options), db_id_(std::move(db_id)), db_session_id_(std::move(db_session_id)), job_id_(job_id), column_family_id_(column_family_id), column_family_name_(column_family_name), - io_priority_(io_priority), write_hint_(write_hint), io_tracer_(io_tracer), blob_callback_(blob_callback), @@ -87,6 +87,7 @@ BlobFileBuilder::BlobFileBuilder( assert(fs_); assert(immutable_options_); assert(file_options_); + assert(write_options_); assert(blob_file_paths_); assert(blob_file_paths_->empty()); assert(blob_file_additions_); @@ -207,14 +208,14 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { blob_file_paths_->emplace_back(std::move(blob_file_path)); assert(file); - file->SetIOPriority(io_priority_); + file->SetIOPriority(write_options_->rate_limiter_priority); file->SetWriteLifeTimeHint(write_hint_); FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; Statistics* const statistics = immutable_options_->stats; std::unique_ptr file_writer(new WritableFileWriter( std::move(file), blob_file_paths_->back(), *file_options_, immutable_options_->clock, io_tracer_, statistics, - immutable_options_->listeners, + Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS, immutable_options_->listeners, immutable_options_->file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kBlobFile), false)); @@ -231,7 +232,7 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { expiration_range); { - Status s = blob_log_writer->WriteHeader(header); + Status s = blob_log_writer->WriteHeader(*write_options_, header); TEST_SYNC_POINT_CALLBACK( "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); @@ -296,7 +297,8 @@ Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, uint64_t key_offset = 0; - Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + Status s = + writer_->AddRecord(*write_options_, key, blob, &key_offset, blob_offset); TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); @@ -321,7 +323,8 @@ Status BlobFileBuilder::CloseBlobFile() { std::string checksum_method; std::string checksum_value; - Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + Status s = writer_->AppendFooter(*write_options_, footer, &checksum_method, + &checksum_value); TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h index 8e7aab502d6..6ba7181aa09 100644 --- a/db/blob/blob_file_builder.h +++ b/db/blob/blob_file_builder.h @@ -13,6 +13,7 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/compression_type.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/types.h" @@ -36,11 +37,11 @@ class BlobFileBuilder { BlobFileBuilder(VersionSet* versions, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, - const FileOptions* file_options, std::string db_id, + const FileOptions* file_options, + const WriteOptions* write_options, std::string db_id, std::string db_session_id, int job_id, uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, @@ -51,11 +52,11 @@ class BlobFileBuilder { BlobFileBuilder(std::function file_number_generator, FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, - const FileOptions* file_options, std::string db_id, + const FileOptions* file_options, + const WriteOptions* write_options, std::string db_id, std::string db_session_id, int job_id, uint32_t column_family_id, const std::string& column_family_name, - Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, const std::shared_ptr& io_tracer, BlobFileCompletionCallback* blob_callback, @@ -92,12 +93,12 @@ class BlobFileBuilder { CompressionType blob_compression_type_; PrepopulateBlobCache prepopulate_blob_cache_; const FileOptions* file_options_; + const WriteOptions* write_options_; const std::string db_id_; const std::string db_session_id_; int job_id_; uint32_t column_family_id_; std::string column_family_name_; - Env::IOPriority io_priority_; Env::WriteLifeTimeHint write_hint_; std::shared_ptr io_tracer_; BlobFileCompletionCallback* blob_callback_; diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc index 5882e219fe4..8a2ecff13a7 100644 --- a/db/blob/blob_file_builder_test.cc +++ b/db/blob/blob_file_builder_test.cc @@ -43,6 +43,7 @@ class BlobFileBuilderTest : public testing::Test { mock_env_.reset(MockEnv::Create(Env::Default())); fs_ = mock_env_->GetFileSystem().get(); clock_ = mock_env_->GetSystemClock().get(); + write_options_.rate_limiter_priority = Env::IO_HIGH; } void VerifyBlobFile(uint64_t blob_file_number, @@ -113,6 +114,7 @@ class BlobFileBuilderTest : public testing::Test { FileSystem* fs_; SystemClock* clock_; FileOptions file_options_; + WriteOptions write_options_; }; TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { @@ -136,7 +138,6 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -144,8 +145,8 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -221,7 +222,6 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -229,8 +229,8 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -309,7 +309,6 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -317,8 +316,8 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -364,7 +363,6 @@ TEST_F(BlobFileBuilderTest, Compression) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -372,8 +370,8 @@ TEST_F(BlobFileBuilderTest, Compression) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -448,7 +446,6 @@ TEST_F(BlobFileBuilderTest, CompressionError) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -456,8 +453,8 @@ TEST_F(BlobFileBuilderTest, CompressionError) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -528,7 +525,6 @@ TEST_F(BlobFileBuilderTest, Checksum) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -536,8 +532,8 @@ TEST_F(BlobFileBuilderTest, Checksum) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); @@ -589,11 +585,13 @@ class BlobFileBuilderIOErrorTest BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) { mock_env_.reset(MockEnv::Create(Env::Default())); fs_ = mock_env_->GetFileSystem().get(); + write_options_.rate_limiter_priority = Env::IO_HIGH; } std::unique_ptr mock_env_; FileSystem* fs_; FileOptions file_options_; + WriteOptions write_options_; std::string sync_point_; }; @@ -626,7 +624,6 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { constexpr int job_id = 1; constexpr uint32_t column_family_id = 123; constexpr char column_family_name[] = "foobar"; - constexpr Env::IOPriority io_priority = Env::IO_HIGH; constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; std::vector blob_file_paths; @@ -634,8 +631,8 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { BlobFileBuilder builder( TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, - &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, - column_family_id, column_family_name, io_priority, write_hint, + &file_options_, &write_options_, "" /*db_id*/, "" /*db_session_id*/, + job_id, column_family_id, column_family_name, write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); diff --git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc index 8c3c56de9b4..edfeb7e810e 100644 --- a/db/blob/blob_file_cache_test.cc +++ b/db/blob/blob_file_cache_test.cc @@ -57,7 +57,7 @@ void WriteBlobFile(uint32_t column_family_id, BlobLogHeader header(column_family_id, kNoCompression, has_ttl, expiration_range); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); constexpr char key[] = "key"; constexpr char blob[] = "blob"; @@ -67,7 +67,8 @@ void WriteBlobFile(uint32_t column_family_id, uint64_t key_offset = 0; uint64_t blob_offset = 0; - ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset, + &blob_offset)); BlobLogFooter footer; footer.blob_count = 1; @@ -76,8 +77,8 @@ void WriteBlobFile(uint32_t column_family_id, std::string checksum_method; std::string checksum_value; - ASSERT_OK( - blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); } } // anonymous namespace diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index b6049d1ef5f..676cbed41e8 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -63,7 +63,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, BlobLogHeader header(column_family_id, compression, has_ttl, expiration_range_header); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); std::vector compressed_blobs(num); std::vector blobs_to_write(num); @@ -91,7 +91,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, for (size_t i = 0; i < num; ++i) { uint64_t key_offset = 0; - ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), keys[i], + blobs_to_write[i], &key_offset, &blob_offsets[i])); } @@ -101,8 +102,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, std::string checksum_method; std::string checksum_value; - ASSERT_OK( - blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); } // Creates a test blob file with a single blob in it. Note: this method @@ -404,7 +405,7 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { requests_buf[0] = BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0], - kNoCompression, nullptr, &statuses_buf[0]); + kNoCompression, nullptr, statuses_buf.data()); requests_buf[1] = BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1, kNoCompression, nullptr, &statuses_buf[1]); @@ -473,7 +474,7 @@ TEST_F(BlobFileReaderTest, Malformed) { BlobLogHeader header(column_family_id, kNoCompression, has_ttl, expiration_range); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); } constexpr HistogramImpl* blob_file_read_hist = nullptr; diff --git a/db/blob/blob_log_writer.cc b/db/blob/blob_log_writer.cc index bf5ef27c1d6..d1768f90209 100644 --- a/db/blob/blob_log_writer.cc +++ b/db/blob/blob_log_writer.cc @@ -33,35 +33,49 @@ BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, BlobLogWriter::~BlobLogWriter() = default; -Status BlobLogWriter::Sync() { +Status BlobLogWriter::Sync(const WriteOptions& write_options) { TEST_SYNC_POINT("BlobLogWriter::Sync"); StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); - Status s = dest_->Sync(use_fsync_); - RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Sync(opts, use_fsync_); + } + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + } return s; } -Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { +Status BlobLogWriter::WriteHeader(const WriteOptions& write_options, + BlobLogHeader& header) { assert(block_offset_ == 0); assert(last_elem_type_ == kEtNone); std::string str; header.EncodeTo(&str); - Status s = dest_->Append(Slice(str)); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Append(opts, Slice(str)); + } if (s.ok()) { block_offset_ += str.size(); if (do_flush_) { - s = dest_->Flush(); + s = dest_->Flush(opts); } } last_elem_type_ = kEtFileHdr; - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, - BlobLogHeader::kSize); + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + } return s; } -Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, +Status BlobLogWriter::AppendFooter(const WriteOptions& write_options, + BlobLogFooter& footer, std::string* checksum_method, std::string* checksum_value) { assert(block_offset_ != 0); @@ -75,14 +89,17 @@ Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, s.PermitUncheckedError(); return Status::IOError("Seen Error. Skip closing."); } else { - s = dest_->Append(Slice(str)); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Append(opts, Slice(str)); + } if (s.ok()) { block_offset_ += str.size(); - - s = Sync(); + s = Sync(write_options); if (s.ok()) { - s = dest_->Close(); + s = dest_->Close(opts); if (s.ok()) { assert(!!checksum_method == !!checksum_value); @@ -111,12 +128,15 @@ Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, } last_elem_type_ = kEtFileFooter; - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, - BlobLogFooter::kSize); + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + } return s; } -Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, +Status BlobLogWriter::AddRecord(const WriteOptions& write_options, + const Slice& key, const Slice& val, uint64_t expiration, uint64_t* key_offset, uint64_t* blob_offset) { assert(block_offset_ != 0); @@ -125,11 +145,13 @@ Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, std::string buf; ConstructBlobHeader(&buf, key, val, expiration); - Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + Status s = + EmitPhysicalRecord(write_options, buf, key, val, key_offset, blob_offset); return s; } -Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, +Status BlobLogWriter::AddRecord(const WriteOptions& write_options, + const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { assert(block_offset_ != 0); assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); @@ -137,7 +159,8 @@ Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, std::string buf; ConstructBlobHeader(&buf, key, val, 0); - Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + Status s = + EmitPhysicalRecord(write_options, buf, key, val, key_offset, blob_offset); return s; } @@ -150,28 +173,34 @@ void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, record.EncodeHeaderTo(buf); } -Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, +Status BlobLogWriter::EmitPhysicalRecord(const WriteOptions& write_options, + const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { - StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); - Status s = dest_->Append(Slice(headerbuf)); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); if (s.ok()) { - s = dest_->Append(key); + s = dest_->Append(opts, Slice(headerbuf)); } if (s.ok()) { - s = dest_->Append(val); + s = dest_->Append(opts, key); + } + if (s.ok()) { + s = dest_->Append(opts, val); } if (do_flush_ && s.ok()) { - s = dest_->Flush(); + s = dest_->Flush(opts); } *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; *blob_offset = *key_offset + key.size(); block_offset_ = *blob_offset + val.size(); last_elem_type_ = kEtRecord; - RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, - BlobLogRecord::kHeaderSize + key.size() + val.size()); + if (s.ok()) { + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + } return s; } diff --git a/db/blob/blob_log_writer.h b/db/blob/blob_log_writer.h index c1f9f31ad00..0ba4f9c2a2e 100644 --- a/db/blob/blob_log_writer.h +++ b/db/blob/blob_log_writer.h @@ -43,20 +43,24 @@ class BlobLogWriter { static void ConstructBlobHeader(std::string* buf, const Slice& key, const Slice& val, uint64_t expiration); - Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, + Status AddRecord(const WriteOptions& write_options, const Slice& key, + const Slice& val, uint64_t* key_offset, uint64_t* blob_offset); - Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration, - uint64_t* key_offset, uint64_t* blob_offset); + Status AddRecord(const WriteOptions& write_options, const Slice& key, + const Slice& val, uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset); - Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, + Status EmitPhysicalRecord(const WriteOptions& write_options, + const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset); - Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method, + Status AppendFooter(const WriteOptions& write_options, BlobLogFooter& footer, + std::string* checksum_method, std::string* checksum_value); - Status WriteHeader(BlobLogHeader& header); + Status WriteHeader(const WriteOptions& write_options, BlobLogHeader& header); WritableFileWriter* file() { return dest_.get(); } @@ -64,7 +68,7 @@ class BlobLogWriter { uint64_t get_log_number() const { return log_number_; } - Status Sync(); + Status Sync(const WriteOptions& write_options); private: std::unique_ptr dest_; diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index c0e1aba6ec0..a12c210fc28 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -65,7 +65,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, BlobLogHeader header(column_family_id, compression, has_ttl, expiration_range_header); - ASSERT_OK(blob_log_writer.WriteHeader(header)); + ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); std::vector compressed_blobs(num); std::vector blobs_to_write(num); @@ -93,7 +93,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, for (size_t i = 0; i < num; ++i) { uint64_t key_offset = 0; - ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), keys[i], + blobs_to_write[i], &key_offset, &blob_offsets[i])); } @@ -103,8 +104,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, std::string checksum_method; std::string checksum_value; - ASSERT_OK( - blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); + ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer, + &checksum_method, &checksum_value)); } } // anonymous namespace @@ -167,8 +168,8 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) { uint64_t file_size = BlobLogHeader::kSize; for (size_t i = 0; i < num_blobs; ++i) { - keys.push_back({key_strs[i]}); - blobs.push_back({blob_strs[i]}); + keys.emplace_back(key_strs[i]); + blobs.emplace_back(blob_strs[i]); file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); } file_size += BlobLogFooter::kSize; @@ -481,8 +482,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) { std::vector blobs; for (size_t i = 0; i < num_blobs; ++i) { - keys.push_back({key_strs[i]}); - blobs.push_back({blob_strs[i]}); + keys.emplace_back(key_strs[i]); + blobs.emplace_back(blob_strs[i]); } std::vector blob_offsets(keys.size()); @@ -609,8 +610,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) { uint64_t file_size = BlobLogHeader::kSize; uint64_t blob_value_bytes = 0; for (size_t i = 0; i < num_blobs; ++i) { - keys.push_back({key_strs[i]}); - blobs.push_back({blob_strs[i]}); + keys.emplace_back(key_strs[i]); + blobs.emplace_back(blob_strs[i]); blob_value_bytes += blobs[i].size(); file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); } @@ -801,8 +802,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) { uint64_t file_size = BlobLogHeader::kSize; for (size_t i = 0; i < num_blobs; ++i) { - keys.push_back({key_strs[i]}); - blobs.push_back({blob_strs[i]}); + keys.emplace_back(key_strs[i]); + blobs.emplace_back(blob_strs[i]); file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); } file_size += BlobLogFooter::kSize; @@ -1163,7 +1164,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number, blob_offsets[0], file_size, blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */, - &values[0], nullptr /* bytes_read */)); + values.data(), nullptr /* bytes_read */)); // Release cache handle values[0].Reset(); @@ -1182,7 +1183,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number, blob_offsets[0], file_size, blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */, - &values[0], nullptr /* bytes_read */)); + values.data(), nullptr /* bytes_read */)); ASSERT_EQ(values[0], blobs[0]); ASSERT_TRUE( blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0])); @@ -1220,7 +1221,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { auto sec_handle0 = secondary_cache->Lookup( key0, BlobSource::SharedCacheInterface::GetFullHelper(), /*context*/ nullptr, true, - /*advise_erase=*/true, kept_in_sec_cache); + /*advise_erase=*/true, /*stats=*/nullptr, kept_in_sec_cache); ASSERT_FALSE(kept_in_sec_cache); ASSERT_NE(sec_handle0, nullptr); ASSERT_TRUE(sec_handle0->IsReady()); @@ -1248,7 +1249,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { auto sec_handle1 = secondary_cache->Lookup( key1, BlobSource::SharedCacheInterface::GetFullHelper(), /*context*/ nullptr, true, - /*advise_erase=*/true, kept_in_sec_cache); + /*advise_erase=*/true, /*stats=*/nullptr, kept_in_sec_cache); ASSERT_FALSE(kept_in_sec_cache); ASSERT_EQ(sec_handle1, nullptr); @@ -1262,7 +1263,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { ASSERT_OK(blob_source.GetBlob( read_options, keys[0], file_number, blob_offsets[0], file_size, blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */, - &values[0], nullptr /* bytes_read */)); + values.data(), nullptr /* bytes_read */)); ASSERT_EQ(values[0], blobs[0]); // Release cache handle @@ -1364,8 +1365,8 @@ class BlobSourceCacheReservationTest : public DBTestBase { blob_file_size_ = BlobLogHeader::kSize; for (size_t i = 0; i < kNumBlobs; ++i) { - keys_.push_back({key_strs_[i]}); - blobs_.push_back({blob_strs_[i]}); + keys_.emplace_back(key_strs_[i]); + blobs_.emplace_back(blob_strs_[i]); blob_file_size_ += BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size(); } diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index 1c0caba93d9..e41933d3549 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -418,8 +418,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobs) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -441,8 +441,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobs) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -512,8 +512,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -534,8 +534,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -553,8 +553,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -574,8 +574,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -758,8 +758,8 @@ TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { // // [offset=0, len=12288] - db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -829,8 +829,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, + keys.data(), values.data(), statuses.data()); for (size_t i = 0; i < kNumKeys; ++i) { ASSERT_OK(statuses[i]); @@ -843,8 +843,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, + keys.data(), values.data(), statuses.data()); for (size_t i = 0; i < kNumKeys; ++i) { ASSERT_TRUE(statuses[i].IsIncomplete()); @@ -858,8 +858,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, + keys.data(), values.data(), statuses.data()); for (size_t i = 0; i < kNumKeys; ++i) { ASSERT_OK(statuses[i]); @@ -872,8 +872,8 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, + keys.data(), values.data(), statuses.data()); for (size_t i = 0; i < kNumKeys; ++i) { ASSERT_OK(statuses[i]); @@ -1182,6 +1182,30 @@ TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { ASSERT_EQ(Get("Key1"), "v1,v2,v3"); } +TEST_F(DBBlobBasicTest, GetMergeBlobFromMemoryTier) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put(Key(0), "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge(Key(0), "v2")); + ASSERT_OK(Flush()); + + // Regular `Get()` loads data block to cache. + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), Key(0), &value)); + ASSERT_EQ("v1,v2", value); + + // Base value blob is still uncached, so an in-memory read will fail. + ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + ASSERT_TRUE(db_->Get(read_options, Key(0), &value).IsIncomplete()); +} + TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { constexpr size_t num_keys = 3; @@ -1206,8 +1230,8 @@ TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { std::array values; std::array statuses; - db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); @@ -1470,8 +1494,8 @@ TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) { }); SyncPoint::GetInstance()->EnableProcessing(); - db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -1820,7 +1844,7 @@ TEST_F(DBBlobBasicTest, GetEntityBlob) { std::array statuses; db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, - &keys[0], &results[0], &statuses[0]); + keys.data(), results.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(results[0].columns(), expected_columns); @@ -1917,8 +1941,8 @@ TEST_F(DBBlobWithTimestampTest, MultiGetBlobs) { std::array values; std::array statuses; - db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value); @@ -2001,8 +2025,8 @@ TEST_F(DBBlobWithTimestampTest, MultiGetMergeBlobWithPut) { std::array values; std::array statuses; - db_->MultiGet(read_opts, db_->DefaultColumnFamily(), num_keys, &keys[0], - &values[0], &statuses[0]); + db_->MultiGet(read_opts, db_->DefaultColumnFamily(), num_keys, keys.data(), + values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); diff --git a/db/blob/db_blob_index_test.cc b/db/blob/db_blob_index_test.cc index e2997603490..1edaedb8d6e 100644 --- a/db/blob/db_blob_index_test.cc +++ b/db/blob/db_blob_index_test.cc @@ -45,10 +45,10 @@ class DBBlobIndexTest : public DBTestBase { DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {} ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } - - ColumnFamilyData* cfd() { - return static_cast_with_check(cfh())->cfd(); + ColumnFamilyHandleImpl* cfh_impl() { + return static_cast_with_check(cfh()); } + ColumnFamilyData* cfd() { return cfh_impl()->cfd(); } Status PutBlobIndex(WriteBatch* batch, const Slice& key, const Slice& blob_index) { @@ -96,11 +96,9 @@ class DBBlobIndexTest : public DBTestBase { } ArenaWrappedDBIter* GetBlobIterator() { - ColumnFamilyData* column_family = cfd(); DBImpl* db_impl = dbfull(); return db_impl->NewIteratorImpl( - ReadOptions(), column_family, - column_family->GetReferencedSuperVersion(db_impl), + ReadOptions(), cfh_impl(), cfd()->GetReferencedSuperVersion(db_impl), db_impl->GetLatestSequenceNumber(), nullptr /*read_callback*/, true /*expose_blob_index*/); } @@ -325,8 +323,7 @@ TEST_F(DBBlobIndexTest, Iterate) { auto check_is_blob = [&](bool is_blob) { return [is_blob](Iterator* iterator) { - ASSERT_EQ(is_blob, - reinterpret_cast(iterator)->IsBlob()); + ASSERT_EQ(is_blob, static_cast(iterator)->IsBlob()); }; }; diff --git a/db/blob/prefetch_buffer_collection.cc b/db/blob/prefetch_buffer_collection.cc index 079576f518b..908b7b82b56 100644 --- a/db/blob/prefetch_buffer_collection.cc +++ b/db/blob/prefetch_buffer_collection.cc @@ -11,8 +11,10 @@ FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer( uint64_t file_number) { auto& prefetch_buffer = prefetch_buffers_[file_number]; if (!prefetch_buffer) { - prefetch_buffer.reset( - new FilePrefetchBuffer(readahead_size_, readahead_size_)); + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = readahead_size_; + readahead_params.max_readahead_size = readahead_size_; + prefetch_buffer.reset(new FilePrefetchBuffer(readahead_params)); } return prefetch_buffer.get(); diff --git a/db/builder.cc b/db/builder.cc index d3040ee9e23..a3c15ad11e5 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -32,9 +32,11 @@ #include "options/options_helper.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" +#include "seqno_to_time_mapping.h" #include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" @@ -57,8 +59,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, const ReadOptions& read_options, - TableCache* table_cache, InternalIterator* iter, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -68,10 +70,9 @@ Status BuildTable( bool paranoid_file_checks, InternalStats* internal_stats, IOStatus* io_status, const std::shared_ptr& io_tracer, BlobFileCreationReason blob_creation_reason, - const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger, - int job_id, const Env::IOPriority io_priority, - TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, - const std::string* full_history_ts_low, + UnownedPtr seqno_to_time_mapping, + EventLogger* event_logger, int job_id, TableProperties* table_properties, + Env::WriteLifeTimeHint write_hint, const std::string* full_history_ts_low, BlobFileCompletionCallback* blob_callback, Version* version, uint64_t* num_input_entries, uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { @@ -82,11 +83,8 @@ Status BuildTable( auto& ioptions = tboptions.ioptions; // Reports the IOStats for flush for every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; - OutputValidator output_validator( - tboptions.internal_comparator, - /*enable_order_check=*/ - mutable_cf_options.check_flush_compaction_key_order, - /*enable_hash=*/paranoid_file_checks); + OutputValidator output_validator(tboptions.internal_comparator, + /*enable_hash=*/paranoid_file_checks); Status s; meta->fd.file_size = 0; iter->SeekToFirst(); @@ -164,11 +162,11 @@ Status BuildTable( table_file_created = true; FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; - file->SetIOPriority(io_priority); + file->SetIOPriority(tboptions.write_options.rate_limiter_priority); file->SetWriteLifeTimeHint(write_hint); file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, ioptions.clock, io_tracer, - ioptions.stats, ioptions.listeners, + ioptions.stats, Histograms::SST_WRITE_MICROS, ioptions.listeners, ioptions.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); @@ -188,10 +186,11 @@ Status BuildTable( blob_file_additions) ? new BlobFileBuilder( versions, fs, &ioptions, &mutable_cf_options, &file_options, - tboptions.db_id, tboptions.db_session_id, job_id, - tboptions.column_family_id, tboptions.column_family_name, - io_priority, write_hint, io_tracer, blob_callback, - blob_creation_reason, &blob_file_paths, blob_file_additions) + &(tboptions.write_options), tboptions.db_id, + tboptions.db_session_id, job_id, tboptions.column_family_id, + tboptions.column_family_name, write_hint, io_tracer, + blob_callback, blob_creation_reason, &blob_file_paths, + blob_file_additions) : nullptr); const std::atomic kManualCompactionCanceledFalse{false}; @@ -208,43 +207,71 @@ Status BuildTable( /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); const size_t ts_sz = ucmp->timestamp_size(); - const bool strip_timestamp = + const bool logical_strip_timestamp = ts_sz > 0 && !ioptions.persist_user_defined_timestamps; std::string key_after_flush_buf; + std::string value_buf; c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); - const ParsedInternalKey& ikey = c_iter.ikey(); - Slice key_after_flush = key; + ParsedInternalKey ikey = c_iter.ikey(); + key_after_flush_buf.assign(key.data(), key.size()); + Slice key_after_flush = key_after_flush_buf; + Slice value_after_flush = value; + // If user defined timestamps will be stripped from user key after flush, // the in memory version of the key act logically the same as one with a // minimum timestamp. We update the timestamp here so file boundary and // output validator, block builder all see the effect of the stripping. - if (strip_timestamp) { + if (logical_strip_timestamp) { key_after_flush_buf.clear(); ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz); key_after_flush = key_after_flush_buf; } + if (ikey.type == kTypeValuePreferredSeqno) { + auto [unpacked_value, unix_write_time] = + ParsePackedValueWithWriteTime(value); + SequenceNumber preferred_seqno = + seqno_to_time_mapping + ? seqno_to_time_mapping->GetProximalSeqnoBeforeTime( + unix_write_time) + : kMaxSequenceNumber; + if (preferred_seqno < ikey.sequence) { + value_after_flush = + PackValueAndSeqno(unpacked_value, preferred_seqno, &value_buf); + } else { + // Cannot get a useful preferred seqno, convert it to a kTypeValue. + UpdateInternalKey(&key_after_flush_buf, ikey.sequence, kTypeValue); + ikey = ParsedInternalKey(ikey.user_key, ikey.sequence, kTypeValue); + key_after_flush = key_after_flush_buf; + value_after_flush = ParsePackedValueForValue(value); + } + } + // Generate a rolling 64-bit hash of the key and values // Note : // Here "key" integrates 'sequence_number'+'kType'+'user key'. - s = output_validator.Add(key_after_flush, value); + s = output_validator.Add(key_after_flush, value_after_flush); if (!s.ok()) { break; } - builder->Add(key_after_flush, value); + builder->Add(key_after_flush, value_after_flush); - s = meta->UpdateBoundaries(key_after_flush, value, ikey.sequence, - ikey.type); + s = meta->UpdateBoundaries(key_after_flush, value_after_flush, + ikey.sequence, ikey.type); if (!s.ok()) { break; } // TODO(noetzli): Update stats after flush, too. - if (io_priority == Env::IO_HIGH && + // TODO(hx235): Replace `rate_limiter_priority` with `io_activity` for + // flush IO in repair when we have an `Env::IOActivity` enum for it + if ((tboptions.write_options.io_activity == Env::IOActivity::kFlush || + tboptions.write_options.io_activity == Env::IOActivity::kDBOpen || + tboptions.write_options.rate_limiter_priority == Env::IO_HIGH) && IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) { ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); @@ -261,9 +288,12 @@ Status BuildTable( Slice last_tombstone_start_user_key{}; for (range_del_it->SeekToFirst(); range_del_it->Valid(); range_del_it->Next()) { - auto tombstone = range_del_it->Tombstone(); - auto kv = tombstone.Serialize(); - // TODO(yuzhangyu): handle range deletion for UDT in memtables only. + // When user timestamp should not be persisted, we logically strip a + // range tombstone's start and end key's timestamp (replace it with min + // timestamp) before passing them along to table builder and to update + // file boundaries. + auto tombstone = range_del_it->Tombstone(logical_strip_timestamp); + std::pair kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); InternalKey tombstone_end = tombstone.SerializeEndKey(); meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_, @@ -275,7 +305,7 @@ Status BuildTable( SizeApproximationOptions approx_opts; approx_opts.files_size_error_margin = 0.1; meta->compensated_range_deletion_size += versions->ApproximateSize( - approx_opts, read_options, version, kv.first.Encode(), + approx_opts, tboptions.read_options, version, kv.first.Encode(), tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */, TableReaderCaller::kFlush); } @@ -294,12 +324,16 @@ Status BuildTable( if (!s.ok() || empty) { builder->Abandon(); } else { - std::string seqno_to_time_mapping_str; - seqno_to_time_mapping.Encode( - seqno_to_time_mapping_str, meta->fd.smallest_seqno, - meta->fd.largest_seqno, meta->file_creation_time); + SeqnoToTimeMapping relevant_mapping; + if (seqno_to_time_mapping) { + relevant_mapping.CopyFromSeqnoRange(*seqno_to_time_mapping, + meta->fd.smallest_seqno, + meta->fd.largest_seqno); + relevant_mapping.SetCapacity(kMaxSeqnoTimePairsPerSST); + relevant_mapping.Enforce(tboptions.file_creation_time); + } builder->SetSeqnoTimeTableProperties( - seqno_to_time_mapping_str, + relevant_mapping, ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO ? meta->file_creation_time : meta->oldest_ancester_time); @@ -346,13 +380,16 @@ Status BuildTable( // Finish and check for file errors TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); - if (s.ok() && !empty) { + IOOptions opts; + *io_status = + WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts); + if (s.ok() && io_status->ok() && !empty) { StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); - *io_status = file_writer->Sync(ioptions.use_fsync); + *io_status = file_writer->Sync(opts, ioptions.use_fsync); } TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); if (s.ok() && io_status->ok() && !empty) { - *io_status = file_writer->Close(); + *io_status = file_writer->Close(opts); } if (s.ok() && io_status->ok() && !empty) { // Add the checksum information to file metadata. @@ -396,9 +433,9 @@ Status BuildTable( // No matter whether use_direct_io_for_flush_and_compaction is true, // the goal is to cache it here for further user reads. std::unique_ptr it(table_cache->NewIterator( - read_options, file_options, tboptions.internal_comparator, *meta, - nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, - nullptr, + tboptions.read_options, file_options, tboptions.internal_comparator, + *meta, nullptr /* range_del_agg */, + mutable_cf_options.prefix_extractor, nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, @@ -411,7 +448,6 @@ Status BuildTable( s = it->status(); if (s.ok() && paranoid_file_checks) { OutputValidator file_validator(tboptions.internal_comparator, - /*enable_order_check=*/true, /*enable_hash=*/true); for (it->SeekToFirst(); it->Valid(); it->Next()) { // Generate a rolling 64-bit hash of the key and values @@ -436,8 +472,13 @@ Status BuildTable( constexpr IODebugContext* dbg = nullptr; if (table_file_created) { - Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); - ignored.PermitUncheckedError(); + IOOptions opts; + Status prepare = + WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts); + if (prepare.ok()) { + Status ignored = fs->DeleteFile(fname, opts, dbg); + ignored.PermitUncheckedError(); + } } assert(blob_file_additions || blob_file_paths.empty()); diff --git a/db/builder.h b/db/builder.h index 6a6a1866a13..f228f8d0fe7 100644 --- a/db/builder.h +++ b/db/builder.h @@ -23,7 +23,6 @@ #include "rocksdb/status.h" #include "rocksdb/table_properties.h" #include "rocksdb/types.h" -#include "table/scoped_arena_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -50,11 +49,11 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, // // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. -extern Status BuildTable( +Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, const ReadOptions& read_options, - TableCache* table_cache, InternalIterator* iter, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -64,9 +63,8 @@ extern Status BuildTable( bool paranoid_file_checks, InternalStats* internal_stats, IOStatus* io_status, const std::shared_ptr& io_tracer, BlobFileCreationReason blob_creation_reason, - const SeqnoToTimeMapping& seqno_to_time_mapping, + UnownedPtr seqno_to_time_mapping, EventLogger* event_logger = nullptr, int job_id = 0, - const Env::IOPriority io_priority = Env::IO_HIGH, TableProperties* table_properties = nullptr, Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, const std::string* full_history_ts_low = nullptr, diff --git a/db/c.cc b/db/c.cc index 5555ae19875..69cd665c0b6 100644 --- a/db/c.cc +++ b/db/c.cc @@ -46,6 +46,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" +#include "util/stderr_logger.h" #include "utilities/merge_operators.h" using ROCKSDB_NAMESPACE::BackupEngine; @@ -115,6 +116,7 @@ using ROCKSDB_NAMESPACE::Snapshot; using ROCKSDB_NAMESPACE::SstFileMetaData; using ROCKSDB_NAMESPACE::SstFileWriter; using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::StderrLogger; using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; using ROCKSDB_NAMESPACE::Transaction; using ROCKSDB_NAMESPACE::TransactionDB; @@ -444,7 +446,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator { size_t new_value_len; char* tmp_new_value = (*full_merge_)( state_, merge_in.key.data(), merge_in.key.size(), existing_value_data, - existing_value_len, &operand_pointers[0], &operand_sizes[0], + existing_value_len, operand_pointers.data(), operand_sizes.data(), static_cast(n), &success, &new_value_len); merge_out->new_value.assign(tmp_new_value, new_value_len); @@ -473,8 +475,9 @@ struct rocksdb_mergeoperator_t : public MergeOperator { unsigned char success; size_t new_value_len; char* tmp_new_value = (*partial_merge_)( - state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0], - static_cast(operand_count), &success, &new_value_len); + state_, key.data(), key.size(), operand_pointers.data(), + operand_sizes.data(), static_cast(operand_count), &success, + &new_value_len); new_value->assign(tmp_new_value, new_value_len); if (delete_value_ != nullptr) { @@ -884,9 +887,9 @@ rocksdb_t* rocksdb_open_and_trim_history( size_t trim_tslen, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { - column_families.push_back(ColumnFamilyDescriptor( + column_families.emplace_back( std::string(column_family_names[i]), - ColumnFamilyOptions(column_family_options[i]->rep))); + ColumnFamilyOptions(column_family_options[i]->rep)); } std::string trim_ts_(trim_ts, trim_tslen); @@ -917,9 +920,9 @@ rocksdb_t* rocksdb_open_column_families( rocksdb_column_family_handle_t** column_family_handles, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { - column_families.push_back(ColumnFamilyDescriptor( + column_families.emplace_back( std::string(column_family_names[i]), - ColumnFamilyOptions(column_family_options[i]->rep))); + ColumnFamilyOptions(column_family_options[i]->rep)); } DB* db; @@ -951,9 +954,9 @@ rocksdb_t* rocksdb_open_column_families_with_ttl( for (int i = 0; i < num_column_families; i++) { ttls_vec.push_back(ttls[i]); - column_families.push_back(ColumnFamilyDescriptor( + column_families.emplace_back( std::string(column_family_names[i]), - ColumnFamilyOptions(column_family_options[i]->rep))); + ColumnFamilyOptions(column_family_options[i]->rep)); } ROCKSDB_NAMESPACE::DBWithTTL* db; @@ -983,9 +986,9 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( unsigned char error_if_wal_file_exists, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { - column_families.push_back(ColumnFamilyDescriptor( + column_families.emplace_back( std::string(column_family_names[i]), - ColumnFamilyOptions(column_family_options[i]->rep))); + ColumnFamilyOptions(column_family_options[i]->rep)); } DB* db; @@ -1079,7 +1082,7 @@ rocksdb_column_family_handle_t** rocksdb_create_column_families( std::vector handles; std::vector names; for (int i = 0; i != num_column_families; ++i) { - names.push_back(std::string(column_family_names[i])); + names.emplace_back(column_family_names[i]); } SaveError(errptr, db->rep->CreateColumnFamilies( ColumnFamilyOptions(column_family_options->rep), names, @@ -1685,6 +1688,11 @@ void rocksdb_release_snapshot(rocksdb_t* db, delete snapshot; } +uint64_t rocksdb_snapshot_get_sequence_number( + const rocksdb_snapshot_t* snapshot) { + return snapshot->rep->GetSequenceNumber(); +} + char* rocksdb_property_value(rocksdb_t* db, const char* propname) { std::string tmp; if (db->rep->GetProperty(Slice(propname), &tmp)) { @@ -1872,9 +1880,8 @@ void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) { SaveError(errptr, db->rep->DisableFileDeletions()); } -void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force, - char** errptr) { - SaveError(errptr, db->rep->EnableFileDeletions(force)); +void rocksdb_enable_file_deletions(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->EnableFileDeletions()); } void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name, @@ -2782,7 +2789,9 @@ void rocksdb_options_set_cuckoo_table_factory( void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr) { std::unordered_map options_map; - for (int i = 0; i < count; i++) options_map[keys[i]] = values[i]; + for (int i = 0; i < count; i++) { + options_map[keys[i]] = values[i]; + } SaveError(errptr, db->rep->SetOptions(options_map)); } @@ -2791,7 +2800,9 @@ void rocksdb_set_options_cf(rocksdb_t* db, const char* const keys[], const char* const values[], char** errptr) { std::unordered_map options_map; - for (int i = 0; i < count; i++) options_map[keys[i]] = values[i]; + for (int i = 0; i < count; i++) { + options_map[keys[i]] = values[i]; + } SaveError(errptr, db->rep->SetOptions(handle->rep, options_map)); } @@ -2919,6 +2930,23 @@ void rocksdb_options_set_cf_paths(rocksdb_options_t* opt, opt->rep.cf_paths = cf_paths; } +rocksdb_logger_t* rocksdb_logger_create_stderr_logger(int log_level, + const char* prefix) { + rocksdb_logger_t* logger = new rocksdb_logger_t; + + if (prefix) { + logger->rep = std::make_shared( + static_cast(log_level), prefix); + } else { + logger->rep = + std::make_shared(static_cast(log_level)); + } + + return logger; +} + +void rocksdb_logger_destroy(rocksdb_logger_t* logger) { delete logger; } + void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { opt->rep.env = (env ? env->rep : nullptr); } @@ -2929,6 +2957,12 @@ void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { } } +rocksdb_logger_t* rocksdb_options_get_info_log(rocksdb_options_t* opt) { + rocksdb_logger_t* info_log = new rocksdb_logger_t; + info_log->rep = opt->rep.info_log; + return info_log; +} + void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) { opt->rep.info_log_level = static_cast(v); } @@ -3049,6 +3083,14 @@ void rocksdb_options_set_max_bytes_for_level_multiplier_additional( } } +void rocksdb_options_set_ttl(rocksdb_options_t* opt, uint64_t seconds) { + opt->rep.ttl = seconds; +} + +uint64_t rocksdb_options_get_ttl(rocksdb_options_t* opt) { + return opt->rep.ttl; +} + void rocksdb_options_set_periodic_compaction_seconds(rocksdb_options_t* opt, uint64_t seconds) { opt->rep.periodic_compaction_seconds = seconds; @@ -3111,7 +3153,7 @@ void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, unsigned char val) { opt->rep.enable_blob_files = val; } -extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( +ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( rocksdb_options_t* opt) { return opt->rep.enable_blob_files; } @@ -3483,35 +3525,6 @@ unsigned char rocksdb_options_get_advise_random_on_open( return opt->rep.advise_random_on_open; } -void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt, - int v) { - switch (v) { - case 0: - opt->rep.access_hint_on_compaction_start = - ROCKSDB_NAMESPACE::Options::NONE; - break; - case 1: - opt->rep.access_hint_on_compaction_start = - ROCKSDB_NAMESPACE::Options::NORMAL; - break; - case 2: - opt->rep.access_hint_on_compaction_start = - ROCKSDB_NAMESPACE::Options::SEQUENTIAL; - break; - case 3: - opt->rep.access_hint_on_compaction_start = - ROCKSDB_NAMESPACE::Options::WILLNEED; - break; - default: - assert(0); - } -} - -int rocksdb_options_get_access_hint_on_compaction_start( - rocksdb_options_t* opt) { - return opt->rep.access_hint_on_compaction_start; -} - void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt, unsigned char v) { opt->rep.use_adaptive_mutex = v; @@ -3997,6 +4010,16 @@ rocksdb_ratelimiter_t* rocksdb_ratelimiter_create_auto_tuned( return rate_limiter; } +rocksdb_ratelimiter_t* rocksdb_ratelimiter_create_with_mode( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness, + int mode, bool auto_tuned) { + rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; + rate_limiter->rep.reset( + NewGenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness, + static_cast(mode), auto_tuned)); + return rate_limiter; +} + void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) { delete limiter; } @@ -4593,7 +4616,7 @@ void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt, opt->rep.io_timeout = std::chrono::microseconds(microseconds); } -extern ROCKSDB_LIBRARY_API uint64_t +ROCKSDB_LIBRARY_API uint64_t rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { return opt->rep.io_timeout.count(); } @@ -5042,7 +5065,9 @@ void rocksdb_env_lower_high_priority_thread_pool_cpu_priority( } void rocksdb_env_destroy(rocksdb_env_t* env) { - if (!env->is_default) delete env->rep; + if (!env->is_default) { + delete env->rep; + } delete env; } @@ -5426,9 +5451,7 @@ uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].num_deletions; } -extern void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { - delete lf; -} +void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { delete lf; } void rocksdb_get_options_from_string(const rocksdb_options_t* base_options, const char* opts_str, @@ -5508,7 +5531,7 @@ size_t rocksdb_column_family_metadata_get_level_count( rocksdb_level_metadata_t* rocksdb_column_family_metadata_get_level_metadata( rocksdb_column_family_metadata_t* cf_meta, size_t i) { if (i >= cf_meta->rep.levels.size()) { - return NULL; + return nullptr; } rocksdb_level_metadata_t* level_meta = (rocksdb_level_metadata_t*)malloc(sizeof(rocksdb_level_metadata_t)); @@ -5723,9 +5746,9 @@ rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( rocksdb_column_family_handle_t** column_family_handles, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { - column_families.push_back(ColumnFamilyDescriptor( + column_families.emplace_back( std::string(column_family_names[i]), - ColumnFamilyOptions(column_family_options[i]->rep))); + ColumnFamilyOptions(column_family_options[i]->rep)); } TransactionDB* txn_db; @@ -6517,9 +6540,9 @@ rocksdb_optimistictransactiondb_open_column_families( rocksdb_column_family_handle_t** column_family_handles, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { - column_families.push_back(ColumnFamilyDescriptor( + column_families.emplace_back( std::string(column_family_names[i]), - ColumnFamilyOptions(column_family_options[i]->rep))); + ColumnFamilyOptions(column_family_options[i]->rep)); } OptimisticTransactionDB* otxn_db; diff --git a/db/c_test.c b/db/c_test.c index 66722049692..656e2fa5f71 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -50,12 +50,13 @@ static void StartPhase(const char* name) { #endif static const char* GetTempDir(void) { const char* ret = getenv("TEST_TMPDIR"); - if (ret == NULL || ret[0] == '\0') + if (ret == NULL || ret[0] == '\0') { #ifdef OS_WIN ret = getenv("TEMP"); #else ret = "/tmp"; #endif + } return ret; } #ifdef _MSC_VER @@ -206,10 +207,11 @@ static int CmpCompare(void* arg, const char* a, size_t alen, const char* b, size_t n = (alen < blen) ? alen : blen; int r = memcmp(a, b, n); if (r == 0) { - if (alen < blen) + if (alen < blen) { r = -1; - else if (alen > blen) + } else if (alen > blen) { r = +1; + } } return r; } @@ -718,6 +720,11 @@ int main(int argc, char** argv) { rocksdb_options_set_ratelimiter(options, rate_limiter); rocksdb_ratelimiter_destroy(rate_limiter); + rate_limiter = rocksdb_ratelimiter_create_with_mode(1000 * 1024 * 1024, + 100 * 1000, 10, 0, true); + rocksdb_options_set_ratelimiter(options, rate_limiter); + rocksdb_ratelimiter_destroy(rate_limiter); + roptions = rocksdb_readoptions_create(); rocksdb_readoptions_set_verify_checksums(roptions, 1); rocksdb_readoptions_set_fill_cache(roptions, 1); @@ -878,9 +885,8 @@ int main(int argc, char** argv) { StartPhase("addfile"); { rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create(); - rocksdb_options_t* io_options = rocksdb_options_create(); rocksdb_sstfilewriter_t* writer = - rocksdb_sstfilewriter_create(env_opt, io_options); + rocksdb_sstfilewriter_create(env_opt, options); remove(sstfilename); rocksdb_sstfilewriter_open(writer, sstfilename, &err); @@ -939,7 +945,6 @@ int main(int argc, char** argv) { rocksdb_ingestexternalfileoptions_destroy(ing_opt); rocksdb_sstfilewriter_destroy(writer); - rocksdb_options_destroy(io_options); rocksdb_envoptions_destroy(env_opt); // Delete all keys we just ingested @@ -1925,6 +1930,9 @@ int main(int argc, char** argv) { CheckCondition(100000 == rocksdb_options_get_periodic_compaction_seconds(o)); + rocksdb_options_set_ttl(o, 5000); + CheckCondition(5000 == rocksdb_options_get_ttl(o)); + rocksdb_options_set_skip_stats_update_on_db_open(o, 1); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); @@ -2031,8 +2039,6 @@ int main(int argc, char** argv) { rocksdb_options_set_advise_random_on_open(o, 1); CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); - rocksdb_options_set_access_hint_on_compaction_start(o, 3); - CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); rocksdb_options_set_use_adaptive_mutex(o, 1); CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); @@ -2228,8 +2234,6 @@ int main(int argc, char** argv) { CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy)); CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy)); CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy)); - CheckCondition(3 == - rocksdb_options_get_access_hint_on_compaction_start(copy)); CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy)); CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy)); CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy)); @@ -2361,6 +2365,10 @@ int main(int argc, char** argv) { CheckCondition(100000 == rocksdb_options_get_periodic_compaction_seconds(o)); + rocksdb_options_set_ttl(copy, 8000); + CheckCondition(8000 == rocksdb_options_get_ttl(copy)); + CheckCondition(5000 == rocksdb_options_get_ttl(o)); + rocksdb_options_set_skip_stats_update_on_db_open(copy, 0); CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); @@ -2508,11 +2516,6 @@ int main(int argc, char** argv) { CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy)); CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); - rocksdb_options_set_access_hint_on_compaction_start(copy, 2); - CheckCondition(2 == - rocksdb_options_get_access_hint_on_compaction_start(copy)); - CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); - rocksdb_options_set_use_adaptive_mutex(copy, 0); CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy)); CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); @@ -2781,6 +2784,30 @@ int main(int argc, char** argv) { rocksdb_memory_allocator_destroy(allocator); } + StartPhase("stderr_logger"); + { + rocksdb_options_t* o_no_prefix = rocksdb_options_create(); + rocksdb_logger_t* no_prefix_logger = + rocksdb_logger_create_stderr_logger(3, NULL); + rocksdb_options_set_info_log(o_no_prefix, no_prefix_logger); + rocksdb_logger_t* no_prefix_info_log = + rocksdb_options_get_info_log(o_no_prefix); + CheckCondition(no_prefix_info_log != NULL); + rocksdb_logger_destroy(no_prefix_logger); + rocksdb_logger_destroy(no_prefix_info_log); + rocksdb_options_destroy(o_no_prefix); + + rocksdb_options_t* o_prefix = rocksdb_options_create(); + rocksdb_logger_t* prefix_logger = + rocksdb_logger_create_stderr_logger(3, "some prefix"); + rocksdb_options_set_info_log(o_prefix, prefix_logger); + rocksdb_logger_t* prefix_info_log = rocksdb_options_get_info_log(o_prefix); + CheckCondition(prefix_info_log != NULL); + rocksdb_logger_destroy(prefix_logger); + rocksdb_logger_destroy(prefix_info_log); + rocksdb_options_destroy(o_prefix); + } + StartPhase("env"); { rocksdb_env_t* e; @@ -3687,7 +3714,7 @@ int main(int argc, char** argv) { StartPhase("statistics"); { - const uint32_t BYTES_WRITTEN_TICKER = 40; + const uint32_t BYTES_WRITTEN_TICKER = 60; const uint32_t DB_WRITE_HIST = 1; rocksdb_statistics_histogram_data_t* hist = diff --git a/db/column_family.cc b/db/column_family.cc index dc74c16d7b3..d16c7b55a20 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -96,16 +96,16 @@ const Comparator* ColumnFamilyHandleImpl::GetComparator() const { return cfd()->user_comparator(); } -void GetIntTblPropCollectorFactory( +void GetInternalTblPropCollFactory( const ImmutableCFOptions& ioptions, - IntTblPropCollectorFactories* int_tbl_prop_collector_factories) { - assert(int_tbl_prop_collector_factories); + InternalTblPropCollFactories* internal_tbl_prop_coll_factories) { + assert(internal_tbl_prop_coll_factories); auto& collector_factories = ioptions.table_properties_collector_factories; for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); ++i) { assert(collector_factories[i]); - int_tbl_prop_collector_factories->emplace_back( + internal_tbl_prop_coll_factories->emplace_back( new UserKeyTablePropertiesCollectorFactory(collector_factories[i])); } } @@ -322,8 +322,8 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, } ROCKS_LOG_WARN(db_options.logger, "Adjust the value to " - "level0_stop_writes_trigger(%d)" - "level0_slowdown_writes_trigger(%d)" + "level0_stop_writes_trigger(%d) " + "level0_slowdown_writes_trigger(%d) " "level0_file_num_compaction_trigger(%d)", result.level0_stop_writes_trigger, result.level0_slowdown_writes_trigger, @@ -358,16 +358,16 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, if (result.level_compaction_dynamic_level_bytes) { if (result.compaction_style != kCompactionStyleLevel) { - ROCKS_LOG_WARN(db_options.info_log.get(), - "level_compaction_dynamic_level_bytes only makes sense" + ROCKS_LOG_INFO(db_options.info_log.get(), + "level_compaction_dynamic_level_bytes only makes sense " "for level-based compaction"); result.level_compaction_dynamic_level_bytes = false; } else if (result.cf_paths.size() > 1U) { // we don't yet know how to make both of this feature and multiple // DB path work. ROCKS_LOG_WARN(db_options.info_log.get(), - "multiple cf_paths/db_paths and" - "level_compaction_dynamic_level_bytes" + "multiple cf_paths/db_paths and " + "level_compaction_dynamic_level_bytes " "can't be used together"); result.level_compaction_dynamic_level_bytes = false; } @@ -411,6 +411,13 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, "periodic_compaction_seconds does not support FIFO compaction. You" "may want to set option TTL instead."); } + if (result.last_level_temperature != Temperature::kUnknown) { + ROCKS_LOG_WARN( + db_options.info_log.get(), + "last_level_temperature is ignored with FIFO compaction. Consider " + "CompactionOptionsFIFO::file_temperature_age_thresholds."); + result.last_level_temperature = Temperature::kUnknown; + } } // For universal compaction, `ttl` and `periodic_compaction_seconds` mean the @@ -470,13 +477,16 @@ void SuperVersion::Cleanup() { cfd->UnrefAndTryDelete(); } -void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, - MemTableListVersion* new_imm, Version* new_current) { +void SuperVersion::Init( + ColumnFamilyData* new_cfd, MemTable* new_mem, MemTableListVersion* new_imm, + Version* new_current, + std::shared_ptr new_seqno_to_time_mapping) { cfd = new_cfd; mem = new_mem; imm = new_imm; current = new_current; full_history_ts_low = cfd->GetFullHistoryTsLow(); + seqno_to_time_mapping = std::move(new_seqno_to_time_mapping); cfd->Ref(); mem->Ref(); imm->Ref(); @@ -572,7 +582,7 @@ ColumnFamilyData::ColumnFamilyData( Ref(); // Convert user defined table properties collector factories to internal ones. - GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_); + GetInternalTblPropCollFactory(ioptions_, &internal_tbl_prop_coll_factories_); // if _dummy_versions is nullptr, then this is a dummy column family. if (_dummy_versions != nullptr) { @@ -833,8 +843,8 @@ std::unique_ptr SetupDelay( return write_controller->GetDelayToken(write_rate); } -int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, - int level0_slowdown_writes_trigger) { +int GetL0FileCountForCompactionSpeedup(int level0_file_num_compaction_trigger, + int level0_slowdown_writes_trigger) { // SanitizeOptions() ensures it. assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger); @@ -864,6 +874,43 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, return static_cast(res); } } + +uint64_t GetPendingCompactionBytesForCompactionSpeedup( + const MutableCFOptions& mutable_cf_options, + const VersionStorageInfo* vstorage) { + // Compaction debt relatively large compared to the stable (bottommost) data + // size indicates compaction fell behind. + const uint64_t kBottommostSizeDivisor = 8; + // Meaningful progress toward the slowdown trigger is another good indication. + const uint64_t kSlowdownTriggerDivisor = 4; + + uint64_t bottommost_files_size = 0; + for (const auto& level_and_file : vstorage->BottommostFiles()) { + bottommost_files_size += level_and_file.second->fd.GetFileSize(); + } + + // Slowdown trigger might be zero but that means compaction speedup should + // always happen (undocumented/historical), so no special treatment is needed. + uint64_t slowdown_threshold = + mutable_cf_options.soft_pending_compaction_bytes_limit / + kSlowdownTriggerDivisor; + + // Size of zero, however, should not be used to decide to speedup compaction. + if (bottommost_files_size == 0) { + return slowdown_threshold; + } + + uint64_t size_threshold = bottommost_files_size / kBottommostSizeDivisor; + return std::min(size_threshold, slowdown_threshold); +} + +uint64_t GetMarkedFileCountForCompactionSpeedup() { + // When just one file is marked, it is not clear that parallel compaction will + // help the compaction that the user nicely requested to happen sooner. When + // multiple files are marked, however, it is pretty clearly helpful, except + // for the rare case in which a single compaction grabs all the marked files. + return 2; +} } // anonymous namespace std::pair @@ -1041,7 +1088,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( } else { assert(write_stall_condition == WriteStallCondition::kNormal); if (vstorage->l0_delay_trigger_count() >= - GetL0ThresholdSpeedupCompaction( + GetL0FileCountForCompactionSpeedup( mutable_cf_options.level0_file_num_compaction_trigger, mutable_cf_options.level0_slowdown_writes_trigger)) { write_controller_token_ = @@ -1051,22 +1098,32 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( "[%s] Increasing compaction threads because we have %d level-0 " "files ", name_.c_str(), vstorage->l0_delay_trigger_count()); - } else if (vstorage->estimated_compaction_needed_bytes() >= - mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { - // Increase compaction threads if bytes needed for compaction exceeds - // 1/4 of threshold for slowing down. + } else if (mutable_cf_options.soft_pending_compaction_bytes_limit == 0) { // If soft pending compaction byte limit is not set, always speed up // compaction. write_controller_token_ = write_controller->GetCompactionPressureToken(); - if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { - ROCKS_LOG_INFO( - ioptions_.logger, - "[%s] Increasing compaction threads because of estimated pending " - "compaction " - "bytes %" PRIu64, - name_.c_str(), vstorage->estimated_compaction_needed_bytes()); - } + } else if (vstorage->estimated_compaction_needed_bytes() >= + GetPendingCompactionBytesForCompactionSpeedup( + mutable_cf_options, vstorage)) { + write_controller_token_ = + write_controller->GetCompactionPressureToken(); + ROCKS_LOG_INFO( + ioptions_.logger, + "[%s] Increasing compaction threads because of estimated pending " + "compaction " + "bytes %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes()); + } else if (uint64_t(vstorage->FilesMarkedForCompaction().size()) >= + GetMarkedFileCountForCompactionSpeedup()) { + write_controller_token_ = + write_controller->GetCompactionPressureToken(); + ROCKS_LOG_INFO( + ioptions_.logger, + "[%s] Increasing compaction threads because we have %" PRIu64 + " files marked for compaction", + name_.c_str(), + uint64_t(vstorage->FilesMarkedForCompaction().size())); } else { write_controller_token_.reset(); } @@ -1154,21 +1211,22 @@ bool ColumnFamilyData::RangeOverlapWithCompaction( } Status ColumnFamilyData::RangesOverlapWithMemtables( - const autovector& ranges, SuperVersion* super_version, + const autovector& ranges, SuperVersion* super_version, bool allow_data_in_errors, bool* overlap) { assert(overlap != nullptr); *overlap = false; // Create an InternalIterator over all unflushed memtables Arena arena; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_opts; read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(read_opts, &arena)); - super_version->imm->AddIterators(read_opts, &merge_iter_builder, + merge_iter_builder.AddIterator(super_version->mem->NewIterator( + read_opts, /*seqno_to_time_mapping=*/nullptr, &arena)); + super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr, + &merge_iter_builder, false /* add_range_tombstone_iter */); - ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); + ScopedArenaPtr memtable_iter(merge_iter_builder.Finish()); auto read_seq = super_version->current->version_set()->LastSequence(); ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq); @@ -1198,7 +1256,8 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( if (status.ok()) { if (memtable_iter->Valid() && - ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { + ucmp->CompareWithoutTimestamp(seek_result.user_key, + ranges[i].limit) <= 0) { *overlap = true; } else if (range_del_agg.IsRangeOverlapped(ranges[i].start, ranges[i].limit)) { @@ -1303,7 +1362,12 @@ void ColumnFamilyData::InstallSuperVersion( const MutableCFOptions& mutable_cf_options) { SuperVersion* new_superversion = sv_context->new_superversion.release(); new_superversion->mutable_cf_options = mutable_cf_options; - new_superversion->Init(this, mem_, imm_.current(), current_); + new_superversion->Init(this, mem_, imm_.current(), current_, + sv_context->new_seqno_to_time_mapping + ? std::move(sv_context->new_seqno_to_time_mapping) + : super_version_ + ? super_version_->ShareSeqnoToTimeMapping() + : nullptr); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; if (old_superversion == nullptr || old_superversion->current != current() || diff --git a/db/column_family.h b/db/column_family.h index 3a78ae875a7..a7b2f97c436 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -26,6 +26,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "trace_replay/block_cache_tracer.h" +#include "util/cast_util.h" #include "util/hash_containers.h" #include "util/thread_local.h" @@ -168,11 +169,12 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } + virtual DBImpl* db() const { return db_; } - virtual uint32_t GetID() const override; - virtual const std::string& GetName() const override; - virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override; - virtual const Comparator* GetComparator() const override; + uint32_t GetID() const override; + const std::string& GetName() const override; + Status GetDescriptor(ColumnFamilyDescriptor* desc) override; + const Comparator* GetComparator() const override; private: ColumnFamilyData* cfd_; @@ -193,7 +195,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { internal_cfd_(nullptr) {} void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } - virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } + ColumnFamilyData* cfd() const override { return internal_cfd_; } private: ColumnFamilyData* internal_cfd_; @@ -218,6 +220,9 @@ struct SuperVersion { // enable UDT feature, this is an empty string. std::string full_history_ts_low; + // A shared copy of the DB's seqno to time mapping. + std::shared_ptr seqno_to_time_mapping{nullptr}; + // should be called outside the mutex SuperVersion() = default; ~SuperVersion(); @@ -231,8 +236,23 @@ struct SuperVersion { // that needs to be deleted in to_delete vector. Unrefing those // objects needs to be done in the mutex void Cleanup(); - void Init(ColumnFamilyData* new_cfd, MemTable* new_mem, - MemTableListVersion* new_imm, Version* new_current); + void Init( + ColumnFamilyData* new_cfd, MemTable* new_mem, + MemTableListVersion* new_imm, Version* new_current, + std::shared_ptr new_seqno_to_time_mapping); + + // Share the ownership of the seqno to time mapping object referred to in this + // SuperVersion. To be used by the new SuperVersion to be installed after this + // one if seqno to time mapping does not change in between these two + // SuperVersions. Or to share the ownership of the mapping with a FlushJob. + std::shared_ptr ShareSeqnoToTimeMapping() { + return seqno_to_time_mapping; + } + + // Access the seqno to time mapping object in this SuperVersion. + UnownedPtr GetSeqnoToTimeMapping() const { + return seqno_to_time_mapping.get(); + } // The value of dummy is not actually used. kSVInUse takes its address as a // mark in the thread local storage to indicate the SuperVersion is in use @@ -251,22 +271,21 @@ struct SuperVersion { autovector to_delete; }; -extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); +Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); -extern Status CheckConcurrentWritesSupported( - const ColumnFamilyOptions& cf_options); +Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options); -extern Status CheckCFPathsSupported(const DBOptions& db_options, - const ColumnFamilyOptions& cf_options); +Status CheckCFPathsSupported(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); -extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& src); +ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& src); // Wrap user defined table properties collector factories `from cf_options` -// into internal ones in int_tbl_prop_collector_factories. Add a system internal +// into internal ones in internal_tbl_prop_coll_factories. Add a system internal // one too. -extern void GetIntTblPropCollectorFactory( +void GetInternalTblPropCollFactory( const ImmutableCFOptions& ioptions, - IntTblPropCollectorFactories* int_tbl_prop_collector_factories); + InternalTblPropCollFactories* internal_tbl_prop_coll_factories); class ColumnFamilySet; @@ -402,7 +421,7 @@ class ColumnFamilyData { // duration of this function. // // Thread-safe - Status RangesOverlapWithMemtables(const autovector& ranges, + Status RangesOverlapWithMemtables(const autovector& ranges, SuperVersion* super_version, bool allow_data_in_errors, bool* overlap); @@ -431,8 +450,8 @@ class ColumnFamilyData { return internal_comparator_; } - const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const { - return &int_tbl_prop_collector_factories_; + const InternalTblPropCollFactories* internal_tbl_prop_coll_factories() const { + return &internal_tbl_prop_coll_factories_; } SuperVersion* GetSuperVersion() { return super_version_; } @@ -575,7 +594,7 @@ class ColumnFamilyData { std::atomic dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - IntTblPropCollectorFactories int_tbl_prop_collector_factories_; + InternalTblPropCollFactories internal_tbl_prop_coll_factories_; const ColumnFamilyOptions initial_cf_options_; const ImmutableOptions ioptions_; @@ -855,17 +874,17 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { // REQUIRES: Seek() called first // REQUIRES: use this function of DBImpl::column_family_memtables_ should be // under a DB mutex OR from a write thread - virtual MemTable* GetMemTable() const override; + MemTable* GetMemTable() const override; // Returns column family handle for the selected column family // REQUIRES: use this function of DBImpl::column_family_memtables_ should be // under a DB mutex OR from a write thread - virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + ColumnFamilyHandle* GetColumnFamilyHandle() override; // Cannot be called while another thread is calling Seek(). // REQUIRES: use this function of DBImpl::column_family_memtables_ should be // under a DB mutex OR from a write thread - virtual ColumnFamilyData* current() override { return current_; } + ColumnFamilyData* current() override { return current_; } private: ColumnFamilySet* column_family_set_; @@ -873,12 +892,11 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { ColumnFamilyHandleInternal handle_; }; -extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); +uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); -extern const Comparator* GetColumnFamilyUserComparator( +const Comparator* GetColumnFamilyUserComparator( ColumnFamilyHandle* column_family); -extern const ImmutableOptions& GetImmutableOptions( - ColumnFamilyHandle* column_family); +const ImmutableOptions& GetImmutableOptions(ColumnFamilyHandle* column_family); } // namespace ROCKSDB_NAMESPACE diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 25bc0b36f61..90f66077cea 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -270,7 +270,7 @@ class ColumnFamilyTestBase : public testing::Test { void Reopen(const std::vector options = {}) { std::vector names; - for (auto name : names_) { + for (const auto& name : names_) { if (name != "") { names.push_back(name); } @@ -607,7 +607,7 @@ TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) { // Preserve file system state up to here to simulate a crash condition. fault_env->SetFilesystemActive(false); std::vector names; - for (auto name : names_) { + for (const auto& name : names_) { if (name != "") { names.push_back(name); } @@ -669,7 +669,7 @@ TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) { // Preserve file system state up to here to simulate a crash condition. fault_env->SetFilesystemActive(false); std::vector names; - for (auto name : names_) { + for (const auto& name : names_) { if (name != "") { names.push_back(name); } @@ -1034,7 +1034,7 @@ TEST_P(ColumnFamilyTest, CrashAfterFlush) { fault_env->SetFilesystemActive(false); std::vector names; - for (auto name : names_) { + for (const auto& name : names_) { if (name != "") { names.push_back(name); } @@ -2604,90 +2604,92 @@ TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; mutable_cf_options.disable_auto_compactions = false; - vstorage->TEST_set_estimated_compaction_needed_bytes(50); + auto dbmu = dbfull()->TEST_Mutex(); + + vstorage->TEST_set_estimated_compaction_needed_bytes(50, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - vstorage->TEST_set_estimated_compaction_needed_bytes(201); + vstorage->TEST_set_estimated_compaction_needed_bytes(201, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(400); + vstorage->TEST_set_estimated_compaction_needed_bytes(400, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(500); + vstorage->TEST_set_estimated_compaction_needed_bytes(500, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(450); + vstorage->TEST_set_estimated_compaction_needed_bytes(450, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(205); + vstorage->TEST_set_estimated_compaction_needed_bytes(205, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(202); + vstorage->TEST_set_estimated_compaction_needed_bytes(202, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(201); + vstorage->TEST_set_estimated_compaction_needed_bytes(201, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(198); + vstorage->TEST_set_estimated_compaction_needed_bytes(198, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - vstorage->TEST_set_estimated_compaction_needed_bytes(399); + vstorage->TEST_set_estimated_compaction_needed_bytes(399, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(599); + vstorage->TEST_set_estimated_compaction_needed_bytes(599, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(2001); + vstorage->TEST_set_estimated_compaction_needed_bytes(2001, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(3001); + vstorage->TEST_set_estimated_compaction_needed_bytes(3001, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - vstorage->TEST_set_estimated_compaction_needed_bytes(390); + vstorage->TEST_set_estimated_compaction_needed_bytes(390, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(100); + vstorage->TEST_set_estimated_compaction_needed_bytes(100, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); @@ -2706,7 +2708,7 @@ TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(0); - vstorage->TEST_set_estimated_compaction_needed_bytes(300); + vstorage->TEST_set_estimated_compaction_needed_bytes(300, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -2718,14 +2720,14 @@ TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(200); + vstorage->TEST_set_estimated_compaction_needed_bytes(200, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(0); - vstorage->TEST_set_estimated_compaction_needed_bytes(0); + vstorage->TEST_set_estimated_compaction_needed_bytes(0, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); @@ -2744,7 +2746,7 @@ TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); vstorage->set_l0_delay_trigger_count(60); - vstorage->TEST_set_estimated_compaction_needed_bytes(300); + vstorage->TEST_set_estimated_compaction_needed_bytes(300, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); @@ -2753,14 +2755,14 @@ TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { mutable_cf_options.disable_auto_compactions = false; vstorage->set_l0_delay_trigger_count(70); - vstorage->TEST_set_estimated_compaction_needed_bytes(500); + vstorage->TEST_set_estimated_compaction_needed_bytes(500, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(71); - vstorage->TEST_set_estimated_compaction_needed_bytes(501); + vstorage->TEST_set_estimated_compaction_needed_bytes(501, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -2785,19 +2787,21 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { mutable_cf_options.soft_pending_compaction_bytes_limit = 200; mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; - vstorage->TEST_set_estimated_compaction_needed_bytes(40); + auto dbmu = dbfull()->TEST_Mutex(); + + vstorage->TEST_set_estimated_compaction_needed_bytes(40, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(50); + vstorage->TEST_set_estimated_compaction_needed_bytes(50, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(300); + vstorage->TEST_set_estimated_compaction_needed_bytes(300, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(45); + vstorage->TEST_set_estimated_compaction_needed_bytes(45, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); @@ -2853,53 +2857,55 @@ TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) { MutableCFOptions mutable_cf_options1 = mutable_cf_options; mutable_cf_options1.soft_pending_compaction_bytes_limit = 500; - vstorage->TEST_set_estimated_compaction_needed_bytes(50); + auto dbmu = dbfull()->TEST_Mutex(); + + vstorage->TEST_set_estimated_compaction_needed_bytes(50, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(201); + vstorage1->TEST_set_estimated_compaction_needed_bytes(201, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(600); + vstorage1->TEST_set_estimated_compaction_needed_bytes(600, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(70); + vstorage->TEST_set_estimated_compaction_needed_bytes(70, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(800); + vstorage1->TEST_set_estimated_compaction_needed_bytes(800, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(300); + vstorage->TEST_set_estimated_compaction_needed_bytes(300, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(700); + vstorage1->TEST_set_estimated_compaction_needed_bytes(700, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); - vstorage->TEST_set_estimated_compaction_needed_bytes(500); + vstorage->TEST_set_estimated_compaction_needed_bytes(500, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(600); + vstorage1->TEST_set_estimated_compaction_needed_bytes(600, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -2932,29 +2938,31 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { MutableCFOptions mutable_cf_options1 = mutable_cf_options; mutable_cf_options1.level0_slowdown_writes_trigger = 16; - vstorage->TEST_set_estimated_compaction_needed_bytes(40); + auto dbmu = dbfull()->TEST_Mutex(); + + vstorage->TEST_set_estimated_compaction_needed_bytes(40, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(60); + vstorage->TEST_set_estimated_compaction_needed_bytes(60, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(30); + vstorage1->TEST_set_estimated_compaction_needed_bytes(30, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(70); + vstorage1->TEST_set_estimated_compaction_needed_bytes(70, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage->TEST_set_estimated_compaction_needed_bytes(20); + vstorage->TEST_set_estimated_compaction_needed_bytes(20, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - vstorage1->TEST_set_estimated_compaction_needed_bytes(3); + vstorage1->TEST_set_estimated_compaction_needed_bytes(3, dbmu); RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); @@ -2971,6 +2979,116 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); } +TEST_P(ColumnFamilyTest, CompactionSpeedupForCompactionDebt) { + db_options_.max_background_compactions = 6; + Open(); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + MutableCFOptions mutable_cf_options(column_family_options_); + mutable_cf_options.soft_pending_compaction_bytes_limit = + std::numeric_limits::max(); + + auto dbmu = dbfull()->TEST_Mutex(); + + { + // No bottommost data, so debt ratio cannot trigger speedup. + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */, + dbmu); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + } + + // Add a tiny amount of bottommost data. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions())); + + { + // 1MB debt is way bigger than bottommost data so definitely triggers + // speedup. + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */, + dbmu); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); + + // Eight bytes is way smaller than bottommost data so definitely does not + // trigger speedup. + vstorage->TEST_set_estimated_compaction_needed_bytes(8, dbmu); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + } +} + +TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) { + const int kParallelismLimit = 3; + class AlwaysCompactTpc : public TablePropertiesCollector { + public: + Status Finish(UserCollectedProperties* /* properties */) override { + return Status::OK(); + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + const char* Name() const override { return "AlwaysCompactTpc"; } + + bool NeedCompact() const override { return true; } + }; + + class AlwaysCompactTpcf : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new AlwaysCompactTpc(); + } + + const char* Name() const override { return "AlwaysCompactTpcf"; } + }; + + column_family_options_.num_levels = 2; + column_family_options_.table_properties_collector_factories.emplace_back( + std::make_shared()); + db_options_.max_background_compactions = kParallelismLimit; + Open(); + + // Make a nonempty last level. Only marked files in upper levels count. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions())); + WaitForCompaction(); + AssertFilesPerLevel("0,1", 0 /* cf */); + + // Block the compaction thread pool so marked files accumulate in L0. + test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit]; + for (int i = 0; i < kParallelismLimit; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + + // Zero marked upper-level files. No speedup. + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + AssertFilesPerLevel("0,1", 0 /* cf */); + + // One marked upper-level file. No speedup. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + AssertFilesPerLevel("1,1", 0 /* cf */); + + // Two marked upper-level files. Speedup. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed()); + AssertFilesPerLevel("2,1", 0 /* cf */); + + for (int i = 0; i < kParallelismLimit; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } +} + TEST_P(ColumnFamilyTest, CreateAndDestroyOptions) { std::unique_ptr cfo(new ColumnFamilyOptions()); ColumnFamilyHandle* cfh; @@ -3289,9 +3407,13 @@ TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { for (size_t i = 0; i < filenames.size(); i++) { uint64_t number; FileType type; - if (!(ParseFileName(filenames[i], &number, &type))) continue; + if (!(ParseFileName(filenames[i], &number, &type))) { + continue; + } - if (type != kWalFile) continue; + if (type != kWalFile) { + continue; + } logfs.push_back(filenames[i]); } diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 2d53f2b992d..129b29c99ff 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -34,8 +34,8 @@ class CompactFilesTest : public testing::Test { // A class which remembers the name of each flushed file. class FlushedFileCollector : public EventListener { public: - FlushedFileCollector() {} - ~FlushedFileCollector() override {} + FlushedFileCollector() = default; + ~FlushedFileCollector() override = default; void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard lock(mutex_); @@ -45,7 +45,7 @@ class FlushedFileCollector : public EventListener { std::vector GetFlushedFiles() { std::lock_guard lock(mutex_); std::vector result; - for (auto fname : flushed_files_) { + for (const auto& fname : flushed_files_) { result.push_back(fname); } return result; @@ -159,7 +159,9 @@ TEST_F(CompactFilesTest, MultipleLevel) { // Compact files except the file in L3 std::vector files; for (int i = 0; i < 6; ++i) { - if (i == 3) continue; + if (i == 3) { + continue; + } for (auto& file : meta.levels[i].files) { files.push_back(file.db_path + "/" + file.name); } @@ -228,7 +230,7 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); // verify all compaction input files are deleted - for (auto fname : l0_files) { + for (const auto& fname : l0_files) { ASSERT_EQ(Status::NotFound(), env_->FileExists(fname)); } delete db; @@ -492,4 +494,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index bbab8f79fb5..d4c994f5f76 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -13,6 +13,7 @@ #include #include "db/column_family.h" +#include "db/dbformat.h" #include "logging/logging.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/sst_partitioner.h" @@ -21,9 +22,6 @@ namespace ROCKSDB_NAMESPACE { -const uint64_t kRangeTombstoneSentinel = - PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); - int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) { auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b)); if (c != 0) { @@ -162,7 +160,9 @@ std::vector Compaction::PopulateWithAtomicBoundaries( AtomicCompactionUnitBoundary cur_boundary; size_t first_atomic_idx = 0; auto add_unit_boundary = [&](size_t to) { - if (first_atomic_idx == to) return; + if (first_atomic_idx == to) { + return; + } for (size_t k = first_atomic_idx; k < to; k++) { inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary); } @@ -352,11 +352,9 @@ Compaction::Compaction( // for the non-bottommost levels, it tries to build files match the target // file size, but not guaranteed. It could be 2x the size of the target size. - max_output_file_size_ = - bottommost_level_ || grandparents_.empty() || - !_immutable_options.level_compaction_dynamic_file_size - ? target_output_file_size_ - : 2 * target_output_file_size_; + max_output_file_size_ = bottommost_level_ || grandparents_.empty() + ? target_output_file_size_ + : 2 * target_output_file_size_; #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { @@ -757,7 +755,9 @@ int InputSummary(const std::vector& files, char* output, AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", files.at(i)->fd.GetNumber(), sztxt); - if (ret < 0 || ret >= sz) break; + if (ret < 0 || ret >= sz) { + break; + } write += ret; } // if files.size() is non-zero, overwrite the last space diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 50c75f70b22..e6251371f14 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -595,6 +595,6 @@ struct PerKeyPlacementContext { #endif /* !NDEBUG */ // Return sum of sizes of all files in `files`. -extern uint64_t TotalFileSize(const std::vector& files); +uint64_t TotalFileSize(const std::vector& files); } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h index 1b1c28b57ad..8777cf62764 100644 --- a/db/compaction/compaction_iteration_stats.h +++ b/db/compaction/compaction_iteration_stats.h @@ -44,6 +44,13 @@ struct CompactionIterationStats { uint64_t total_blob_bytes_read = 0; uint64_t num_blobs_relocated = 0; uint64_t total_blob_bytes_relocated = 0; + + // TimedPut diagnostics + // Total number of kTypeValuePreferredSeqno records encountered. + uint64_t num_input_timed_put_records = 0; + // Number of kTypeValuePreferredSeqno records we ended up swapping in + // preferred seqno. + uint64_t num_timed_put_swap_preferred_seqno = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 85d1c039bd3..ebda5a6ff77 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -491,6 +491,8 @@ void CompactionIterator::NextFromInput() { if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || ikey_.type == kTypeDeletionWithTimestamp) { iter_stats_.num_input_deletion_records++; + } else if (ikey_.type == kTypeValuePreferredSeqno) { + iter_stats_.num_input_timed_put_records++; } iter_stats_.total_input_raw_key_bytes += key_.size(); iter_stats_.total_input_raw_value_bytes += value_.size(); @@ -618,7 +620,8 @@ void CompactionIterator::NextFromInput() { // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && - ikey_.type != kTypeWideColumnEntity) { + ikey_.type != kTypeWideColumnEntity && + ikey_.type != kTypeValuePreferredSeqno) { ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output", ikey_.DebugString(allow_data_in_errors_, true).c_str()); assert(false); @@ -632,7 +635,8 @@ void CompactionIterator::NextFromInput() { assert(false); } - if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) { + if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity || + ikey_.type == kTypeValuePreferredSeqno) { ikey_.type = kTypeValue; current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); } @@ -798,7 +802,8 @@ void CompactionIterator::NextFromInput() { // happened if (next_ikey.type != kTypeValue && next_ikey.type != kTypeBlobIndex && - next_ikey.type != kTypeWideColumnEntity) { + next_ikey.type != kTypeWideColumnEntity && + next_ikey.type != kTypeValuePreferredSeqno) { ++iter_stats_.num_single_del_mismatch; } @@ -968,6 +973,50 @@ void CompactionIterator::NextFromInput() { validity_info_.SetValid(ValidContext::kKeepDel); at_next_ = true; } + } else if (ikey_.type == kTypeValuePreferredSeqno && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + (bottommost_level_ || + (compaction_ != nullptr && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_)))) { + // This section that attempts to swap preferred sequence number will not + // be invoked if this is a CompactionIterator created for flush, since + // `compaction_` will be nullptr and it's not bottommost either. + // + // The entries with the same user key and smaller sequence numbers are + // all in this earliest snapshot range to be iterated. Since those entries + // will be hidden by this entry [rule A], it's safe to swap in the + // preferred seqno now. + // + // It's otherwise not safe to swap in the preferred seqno since it's + // possible for entries in earlier snapshots to have sequence number that + // is smaller than this entry's sequence number but bigger than this + // entry's preferred sequence number. Swapping in the preferred sequence + // number will break the internal key ordering invariant for this key. + // + // A special case involving range deletion is handled separately below. + auto [unpacked_value, preferred_seqno] = + ParsePackedValueWithSeqno(value_); + assert(preferred_seqno < ikey_.sequence); + InternalKey ikey_after_swap(ikey_.user_key, preferred_seqno, kTypeValue); + Slice ikey_after_swap_slice(*ikey_after_swap.rep()); + if (range_del_agg_->ShouldDelete( + ikey_after_swap_slice, + RangeDelPositioningMode::kForwardTraversal)) { + // A range tombstone that doesn't cover this kTypeValuePreferredSeqno + // entry may end up covering the entry, so it's not safe to swap + // preferred sequence number. In this case, we output the entry as is. + validity_info_.SetValid(ValidContext::kNewUserKey); + } else { + iter_stats_.num_timed_put_swap_preferred_seqno++; + ikey_.sequence = preferred_seqno; + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + value_ = unpacked_value; + validity_info_.SetValid(ValidContext::kSwapPreferredSeqno); + } } else if (ikey_.type == kTypeMerge) { if (!merge_helper_->HasOperator()) { status_ = Status::InvalidArgument( diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 1ff9c886924..eeb75efac4e 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -410,6 +410,7 @@ class CompactionIterator { kKeepDel = 9, kNewUserKey = 10, kRangeDeletion = 11, + kSwapPreferredSeqno = 12, }; struct ValidityInfo { diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 699e629693d..7558a3e5c75 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -17,6 +17,14 @@ #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { +namespace { +std::string ValueWithPreferredSeqno(std::string val, + SequenceNumber preferred_seqno = 0) { + std::string result = val; + PutFixed64(&result, preferred_seqno); + return result; +} +} // namespace // Expects no merging attempts. class NoMergingMergeOp : public MergeOperator { @@ -392,6 +400,17 @@ TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { ASSERT_FALSE(c_iter_->Valid()); } +// Tests compatibility of TimedPut and SingleDelete. TimedPut should act as if +// it's a Put. +TEST_P(CompactionIteratorTest, TimedPutAndSingleDelete) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValuePreferredSeqno)}, + {"", "val"}, {}, {}, 5); + c_iter_->SeekToFirst(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + TEST_P(CompactionIteratorTest, SimpleRangeDeletion) { InitIterators({test::KeyStr("morning", 5, kTypeValue), test::KeyStr("morning", 2, kTypeValue), @@ -431,6 +450,31 @@ TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { ASSERT_FALSE(c_iter_->Valid()); } +// Tests compatibility of TimedPut and Range delete. TimedPut should act as if +// it's a Put. +TEST_P(CompactionIteratorTest, TimedPutAndRangeDeletion) { + InitIterators( + {test::KeyStr("morning", 5, kTypeValuePreferredSeqno), + test::KeyStr("morning", 2, kTypeValuePreferredSeqno), + test::KeyStr("night", 3, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("zao5"), ValueWithPreferredSeqno("zao2"), + ValueWithPreferredSeqno("wan")}, + {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValuePreferredSeqno), + c_iter_->key().ToString()); + ASSERT_EQ(ValueWithPreferredSeqno("zao5"), c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 3, kTypeValuePreferredSeqno), + c_iter_->key().ToString()); + ASSERT_EQ(ValueWithPreferredSeqno("wan"), c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { class Filter : public CompactionFilter { Decision FilterV2(int /*level*/, const Slice& key, ValueType t, @@ -502,9 +546,11 @@ TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue), test::KeyStr("h", 91, kTypeValue), // keep test::KeyStr("i", 95, kTypeMerge), // skip to "z" - test::KeyStr("j", 99, kTypeValue)}, + test::KeyStr("j", 99, kTypeValue), + test::KeyStr("k", 100, kTypeValuePreferredSeqno)}, {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30", - "fv25", "gv90", "hv91", "im95", "jv99"}, + "fv25", "gv90", "hv91", "im95", "jv99", + ValueWithPreferredSeqno("kv100")}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); // Compaction should output just "a", "e" and "h" keys. @@ -614,87 +660,87 @@ TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { EXPECT_EQ(2, filter.last_seen.load()); } -TEST_P(CompactionIteratorTest, SingleMergeOperand) { - class Filter : public CompactionFilter { - Decision FilterV2(int /*level*/, const Slice& key, ValueType t, - const Slice& existing_value, std::string* /*new_value*/, - std::string* /*skip_until*/) const override { - std::string k = key.ToString(); - std::string v = existing_value.ToString(); - - // See InitIterators() call below for the sequence of keys and their - // filtering decisions. Here we closely assert that compaction filter is - // called with the expected keys and only them, and with the right values. - if (k == "a") { - EXPECT_EQ(ValueType::kMergeOperand, t); - EXPECT_EQ("av1", v); - return Decision::kKeep; - } else if (k == "b") { - EXPECT_EQ(ValueType::kMergeOperand, t); - return Decision::kKeep; - } else if (k == "c") { - return Decision::kKeep; - } - - ADD_FAILURE(); +class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. + if (k == "a") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("av1", v); + return Decision::kKeep; + } else if (k == "b") { + EXPECT_EQ(ValueType::kMergeOperand, t); + return Decision::kKeep; + } else if (k == "c") { return Decision::kKeep; } - const char* Name() const override { - return "CompactionIteratorTest.SingleMergeOperand::Filter"; - } - }; + ADD_FAILURE(); + return Decision::kKeep; + } - class SingleMergeOp : public MergeOperator { - public: - bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { - // See InitIterators() call below for why "c" is the only key for which - // FullMergeV2 should be called. - EXPECT_EQ("c", merge_in.key.ToString()); - - std::string temp_value; - if (merge_in.existing_value != nullptr) { - temp_value = merge_in.existing_value->ToString(); - } + const char* Name() const override { + return "CompactionIteratorTest.SingleMergeOperand::Filter"; + } +}; - for (auto& operand : merge_in.operand_list) { - temp_value.append(operand.ToString()); - } - merge_out->new_value = temp_value; +class SingleMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // See InitIterators() call below for why "c" is the only key for which + // FullMergeV2 should be called. + EXPECT_EQ("c", merge_in.key.ToString()); + + std::string temp_value; + if (merge_in.existing_value != nullptr) { + temp_value = merge_in.existing_value->ToString(); + } - return true; + for (auto& operand : merge_in.operand_list) { + temp_value.append(operand.ToString()); } + merge_out->new_value = temp_value; - bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* /*logger*/) const override { - std::string string_key = key.ToString(); - EXPECT_TRUE(string_key == "a" || string_key == "b"); - - if (string_key == "a") { - EXPECT_EQ(1, operand_list.size()); - } else if (string_key == "b") { - EXPECT_EQ(2, operand_list.size()); - } + return true; + } - std::string temp_value; - for (auto& operand : operand_list) { - temp_value.append(operand.ToString()); - } - swap(temp_value, *new_value); + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + std::string string_key = key.ToString(); + EXPECT_TRUE(string_key == "a" || string_key == "b"); - return true; + if (string_key == "a") { + EXPECT_EQ(1, operand_list.size()); + } else if (string_key == "b") { + EXPECT_EQ(2, operand_list.size()); } - const char* Name() const override { - return "CompactionIteratorTest SingleMergeOp"; + std::string temp_value; + for (auto& operand : operand_list) { + temp_value.append(operand.ToString()); } + swap(temp_value, *new_value); - bool AllowSingleOperand() const override { return true; } - }; + return true; + } + + const char* Name() const override { + return "CompactionIteratorTest SingleMergeOp"; + } + bool AllowSingleOperand() const override { return true; } +}; + +TEST_P(CompactionIteratorTest, SingleMergeOperand) { SingleMergeOp merge_op; Filter filter; InitIterators( @@ -719,6 +765,24 @@ TEST_P(CompactionIteratorTest, SingleMergeOperand) { ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); } +// Tests compatibility of TimedPut and Merge operation. When a TimedPut is +// merged with some merge operand in compaction, it will become a regular Put +// and lose its preferred sequence number. +TEST_P(CompactionIteratorTest, TimedPutAndMerge) { + SingleMergeOp merge_op; + Filter filter; + InitIterators({test::KeyStr("c", 90, kTypeMerge), + test::KeyStr("c", 80, kTypeValuePreferredSeqno)}, + {"cv2", ValueWithPreferredSeqno("cv1")}, {}, {}, + kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("c", 90, kTypeValue), c_iter_->key().ToString()); + ASSERT_OK(c_iter_->status()); + ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); +} + // In bottommost level, values earlier than earliest snapshot can be output // with sequence = 0. TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { @@ -963,6 +1027,22 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) { {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); } +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_TimedPut) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("foo", 4, kTypeValuePreferredSeqno), + test::KeyStr("foo", 3, kTypeValuePreferredSeqno), + test::KeyStr("foo", 2, kTypeValuePreferredSeqno), + test::KeyStr("foo", 1, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("v4"), ValueWithPreferredSeqno("v3"), + ValueWithPreferredSeqno("v2"), ValueWithPreferredSeqno("v1")}, + {test::KeyStr("foo", 4, kTypeValuePreferredSeqno), + test::KeyStr("foo", 3, kTypeValuePreferredSeqno), + test::KeyStr("foo", 1, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("v4"), ValueWithPreferredSeqno("v3"), + ValueWithPreferredSeqno("v1")}, + 3 /*last_committed_seq*/); +} + TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) { AddSnapshot(2, 1); RunTest( @@ -1128,6 +1208,114 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, 2 /* earliest_write_conflict_snapshot */); } +// Same as above but with a value with preferred seqno entry. In addition to the +// value getting trimmed, the type of the KV is changed to kTypeValue. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_TimedPut) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValuePreferredSeqno)}, + {"", ValueWithPreferredSeqno("v1")}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */, + nullptr /* compaction_filter */, false /* bottommost_level */, + 2 /* earliest_write_conflict_snapshot */); +} + +// Tests when a kTypeValuePreferredSeqno entry can have its preferred sequence +// number swapped in. The required and sufficient conditions for an entry's +// preferred sequence number to get swapped in are: +// 1) The entry is visible to the earliest snapshot, AND +// 2) No more entries with the same user key on lower levels, AND +// This is either because: +// 2a) This is a compaction to the bottommost level, OR +// 2b) Keys do not exist beyond output level +// 3) The entry will not resurface a range deletion entry after swapping in the +// preferred sequence number. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + TimedPut_NotVisibleToEarliestSnapshot_NoSwapPreferredSeqno) { + AddSnapshot(3); + RunTest({test::KeyStr("bar", 5, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("bv2", 2)}, + {test::KeyStr("bar", 5, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("bv2", 2), "bv1"}, 5 /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/, + kMaxSequenceNumber /*earliest_write_conflict_snapshot*/, + true /*key_not_exists_beyond_output_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + TimedPut_MoreEntriesInLowerLevels_NoSwapPreferredSeqno) { + // This tests mimics more entries in lower levels with `bottommost_level` and + // `key_not_exists_beyond_output_level` set to false. + RunTest({test::KeyStr("bar", 5, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("bv2", 2)}, + {test::KeyStr("bar", 5, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("bv2", 2)}, 5 /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + false /*bottommost_level*/, + kMaxSequenceNumber /*earliest_write_conflict_snapshot*/, + false /*key_not_exists_beyond_output_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + TimedPut_WillBeHiddenByRangeDeletionAfterSwap_NoSwap) { + InitIterators({test::KeyStr("morning", 5, kTypeValuePreferredSeqno), + test::KeyStr("night", 6, kTypeValue)}, + {ValueWithPreferredSeqno("zao", 3), "wan"}, + {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 6, + kMaxSequenceNumber /*last_committed_sequence*/, + nullptr /*merge_op*/, nullptr /*filter*/, + false /*bottommost_level*/, + kMaxSequenceNumber /*earliest_write_conflict_snapshot*/, + true /*key_not_exists_beyond_output_level*/); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValuePreferredSeqno), + c_iter_->key().ToString()); + ASSERT_EQ(ValueWithPreferredSeqno("zao", 3), c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 6, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("wan", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); + ASSERT_OK(c_iter_->status()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + TimedPut_BottomMostLevelVisibleToEarliestSnapshot_SwapPreferredSeqno) { + // Preferred seqno got swapped in and also zeroed out as a bottommost level + // optimization. + RunTest( + {test::KeyStr("bar", 5, kTypeValuePreferredSeqno), + test::KeyStr("bar", 4, kTypeValuePreferredSeqno), + test::KeyStr("foo", 6, kTypeValue)}, + {ValueWithPreferredSeqno("bv2", 2), ValueWithPreferredSeqno("bv1", 1), + "fv1"}, + {test::KeyStr("bar", 0, kTypeValue), test::KeyStr("foo", 0, kTypeValue)}, + {"bv2", "fv1"}, 6 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +TEST_F( + CompactionIteratorWithSnapshotCheckerTest, + TimedPut_NonBottomMostLevelVisibleToEarliestSnapshot_SwapPreferredSeqno) { + RunTest( + {test::KeyStr("bar", 5, kTypeValuePreferredSeqno), + test::KeyStr("bar", 4, kTypeValuePreferredSeqno), + test::KeyStr("foo", 6, kTypeValue)}, + {ValueWithPreferredSeqno("bv2", 2), ValueWithPreferredSeqno("bv1", 1), + "fv1"}, + {test::KeyStr("bar", 2, kTypeValue), test::KeyStr("foo", 6, kTypeValue)}, + {"bv2", "fv1"}, 6 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + kMaxSequenceNumber /*earliest_write_conflict_snapshot*/, + true /*key_not_exists_beyond_output_level*/); +} + // Compaction filter should keep uncommitted key as-is, and // * Convert the latest value to deletion, and/or // * if latest value is a merge, apply filter to all subsequent merges. @@ -1145,6 +1333,22 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { nullptr /*merge_operator*/, compaction_filter.get()); } +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_TimedPut) { + // TODO(yuzhangyu): Add support for this type in compaction filter. + // Type kTypeValuePreferredSeqno is not explicitly exposed in the compaction + // filter API, so users can not operate on it through compaction filter API + // to remove/purge/change value etc. But this type of entry can be impacted by + // other entries' filter result, currently only kRemoveAndSkip type of result + // can affect it. + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest({test::KeyStr("a", 2, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("v1")}, + {test::KeyStr("a", 2, kTypeValuePreferredSeqno)}, + {ValueWithPreferredSeqno("v1")}, 2 /*last_committed_seq*/, + nullptr /*merge_operator*/, compaction_filter.get()); +} + TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) { std::unique_ptr compaction_filter( new FilterAllKeysCompactionFilter()); diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index d0ff1d14566..cfa6aba8012 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -288,39 +288,37 @@ void CompactionJob::Prepare() { if (preserve_time_duration > 0) { const ReadOptions read_options(Env::IOActivity::kCompaction); - // setup seqno_to_time_mapping_ - seqno_to_time_mapping_.SetMaxTimeDuration(preserve_time_duration); + // Setup seqno_to_time_mapping_ with relevant time range. + seqno_to_time_mapping_.SetMaxTimeSpan(preserve_time_duration); for (const auto& each_level : *c->inputs()) { for (const auto& fmd : each_level.files) { std::shared_ptr tp; Status s = cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr); if (s.ok()) { - seqno_to_time_mapping_.Add(tp->seqno_to_time_mapping) - .PermitUncheckedError(); - seqno_to_time_mapping_.Add(fmd->fd.smallest_seqno, - fmd->oldest_ancester_time); + s = seqno_to_time_mapping_.DecodeFrom(tp->seqno_to_time_mapping); + } + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Problem reading or processing seqno-to-time mapping: %s", + s.ToString().c_str()); } } } - auto status = seqno_to_time_mapping_.Sort(); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Invalid sequence number to time mapping: Status: %s", - status.ToString().c_str()); - } int64_t _current_time = 0; - status = db_options_.clock->GetCurrentTime(&_current_time); - if (!status.ok()) { + Status s = db_options_.clock->GetCurrentTime(&_current_time); + if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Failed to get current time in compaction: Status: %s", - status.ToString().c_str()); + s.ToString().c_str()); // preserve all time information preserve_time_min_seqno_ = 0; preclude_last_level_min_seqno_ = 0; + seqno_to_time_mapping_.Enforce(); } else { - seqno_to_time_mapping_.TruncateOldEntries(_current_time); + seqno_to_time_mapping_.Enforce(_current_time); uint64_t preserve_time = static_cast(_current_time) > preserve_time_duration ? _current_time - preserve_time_duration @@ -344,6 +342,16 @@ void CompactionJob::Prepare() { 1; } } + // For accuracy of the GetProximalSeqnoBeforeTime queries above, we only + // limit the capacity after them. + // Here If we set capacity to the per-SST limit, we could be throwing away + // fidelity when a compaction output file has a narrower seqno range than + // all the inputs. If we only limit capacity for each compaction output, we + // could be doing a lot of unnecessary recomputation in a large compaction + // (up to quadratic in number of files). Thus, we do soemthing in the + // middle: enforce a resonably large constant size limit substantially + // larger than kMaxSeqnoTimePairsPerSST. + seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries); } } @@ -396,7 +404,9 @@ void CompactionJob::AcquireSubcompactionResources( void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) { // Do nothing when we have zero resources to shrink - if (num_extra_resources == 0) return; + if (num_extra_resources == 0) { + return; + } db_mutex_->Lock(); // We cannot release threads more than what we reserved before int extra_num_subcompaction_threads_released = env_->ReleaseThreads( @@ -442,14 +452,6 @@ void CompactionJob::ReleaseSubcompactionResources() { ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_); } -struct RangeWithSize { - Range range; - uint64_t size; - - RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0) - : range(a, b), size(s) {} -}; - void CompactionJob::GenSubcompactionBoundaries() { // The goal is to find some boundary keys so that we can evenly partition // the compaction input data into max_subcompactions ranges. @@ -476,7 +478,8 @@ void CompactionJob::GenSubcompactionBoundaries() { // overlap with N-1 other ranges. Since we requested a relatively large number // (128) of ranges from each input files, even N range overlapping would // cause relatively small inaccuracy. - const ReadOptions read_options(Env::IOActivity::kCompaction); + ReadOptions read_options(Env::IOActivity::kCompaction); + read_options.rate_limiter_priority = GetRateLimiterPriority(); auto* c = compact_->compaction; if (c->max_subcompactions() <= 1 && !(c->immutable_options()->compaction_pri == kRoundRobin && @@ -583,7 +586,9 @@ void CompactionJob::GenSubcompactionBoundaries() { TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0", &num_planned_subcompactions); - if (num_planned_subcompactions == 1) return; + if (num_planned_subcompactions == 1) { + return; + } // Group the ranges into subcompactions uint64_t target_range_size = std::max( @@ -640,7 +645,7 @@ Status CompactionJob::Run() { // Always schedule the first subcompaction (whether or not there are also // others) in the current thread to be efficient with resources - ProcessKeyValueCompaction(&compact_->sub_compact_states[0]); + ProcessKeyValueCompaction(compact_->sub_compact_states.data()); // Wait for all other threads (if there are any) to finish execution for (auto& thread : thread_pool) { @@ -728,8 +733,9 @@ Status CompactionJob::Run() { // use_direct_io_for_flush_and_compaction is true, we will regard this // verification as user reads since the goal is to cache it here for // further user reads - const ReadOptions verify_table_read_options( - Env::IOActivity::kCompaction); + ReadOptions verify_table_read_options(Env::IOActivity::kCompaction); + verify_table_read_options.rate_limiter_priority = + GetRateLimiterPriority(); InternalIterator* iter = cfd->table_cache()->NewIterator( verify_table_read_options, file_options_, cfd->internal_comparator(), files_output[file_idx]->meta, @@ -750,7 +756,6 @@ Status CompactionJob::Run() { if (s.ok() && paranoid_file_checks_) { OutputValidator validator(cfd->internal_comparator(), - /*_enable_order_check=*/true, /*_enable_hash=*/true); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { s = validator.Add(iter->key(), iter->value()); @@ -821,24 +826,27 @@ Status CompactionJob::Run() { // input keys. So the number of keys it processed is not suitable for // verification here. // TODO: support verification when trim_ts_ is non-empty. - if (!(ts_sz > 0 && !trim_ts_.empty()) && - db_options_.compaction_verify_record_count) { + if (!(ts_sz > 0 && !trim_ts_.empty())) { assert(compaction_stats_.stats.num_input_records > 0); // TODO: verify the number of range deletion entries. uint64_t expected = compaction_stats_.stats.num_input_records - num_input_range_del; uint64_t actual = compaction_job_stats_->num_input_records; if (expected != actual) { + char scratch[2345]; + compact_->compaction->Summary(scratch, sizeof(scratch)); std::string msg = - "Total number of input records: " + std::to_string(expected) + - ", but processed " + std::to_string(actual) + " records."; + "Compaction number of input keys does not match " + "number of keys processed. Expected " + + std::to_string(expected) + " but processed " + + std::to_string(actual) + ". Compaction summary: " + scratch; ROCKS_LOG_WARN( - db_options_.info_log, "[%s] [JOB %d] Compaction %s", + db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s", compact_->compaction->column_family_data()->GetName().c_str(), job_context_->job_id, msg.c_str()); - status = Status::Corruption( - "Compaction number of input keys does not match number of keys " - "processed."); + if (db_options_.compaction_verify_record_count) { + status = Status::Corruption(msg); + } } } } @@ -1130,6 +1138,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. read_options.total_order_seek = true; + const WriteOptions write_options(Env::IOPriority::IO_LOW, + Env::IOActivity::kCompaction); + // Remove the timestamps from boundaries because boundaries created in // GenSubcompactionBoundaries doesn't strip away the timestamp. size_t ts_sz = cfd->user_comparator()->timestamp_size(); @@ -1265,18 +1276,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { ? new BlobFileBuilder( versions_, fs_.get(), sub_compact->compaction->immutable_options(), - mutable_cf_options, &file_options_, db_id_, db_session_id_, - job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW, + mutable_cf_options, &file_options_, &write_options, db_id_, + db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_, io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction, &blob_file_paths, sub_compact->Current().GetBlobFileAdditionsPtr()) : nullptr); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); - TEST_SYNC_POINT_CALLBACK( - "CompactionJob::Run():PausingManualCompaction:1", - reinterpret_cast( - const_cast*>(&manual_compaction_canceled_))); + TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1", + static_cast(const_cast*>( + &manual_compaction_canceled_))); const std::string* const full_history_ts_low = full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_; @@ -1324,8 +1334,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { Status status; TEST_SYNC_POINT_CALLBACK( "CompactionJob::ProcessKeyValueCompaction()::Processing", - reinterpret_cast( - const_cast(sub_compact->compaction))); + static_cast(const_cast(sub_compact->compaction))); uint64_t last_cpu_micros = prev_cpu_micros; while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() @@ -1356,10 +1365,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { break; } - TEST_SYNC_POINT_CALLBACK( - "CompactionJob::Run():PausingManualCompaction:2", - reinterpret_cast( - const_cast*>(&manual_compaction_canceled_))); + TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:2", + static_cast(const_cast*>( + &manual_compaction_canceled_))); c_iter->Next(); if (c_iter->status().IsManualCompactionPaused()) { break; @@ -1712,6 +1720,8 @@ Status CompactionJob::InstallCompactionResults( db_mutex_->AssertHeld(); const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); + auto* compaction = compact_->compaction; assert(compaction); @@ -1794,8 +1804,9 @@ Status CompactionJob::InstallCompactionResults( }; return versions_->LogAndApply( - compaction->column_family_data(), mutable_cf_options, read_options, edit, - db_mutex_, db_directory_, /*new_descriptor_log=*/false, + compaction->column_family_data(), mutable_cf_options, read_options, + write_options, edit, db_mutex_, db_directory_, + /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, manifest_wcb); } @@ -1845,13 +1856,14 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, // Pass temperature of the last level files to FileSystem. FileOptions fo_copy = file_options_; Temperature temperature = sub_compact->compaction->output_temperature(); - // only set for the last level compaction and also it's not output to - // penultimate level (when preclude_last_level feature is enabled) - if (temperature == Temperature::kUnknown && + Temperature last_level_temp = + sub_compact->compaction->mutable_cf_options()->last_level_temperature; + // Here last_level_temperature supersedes default_write_temperature, when + // enabled and applicable + if (last_level_temp != Temperature::kUnknown && sub_compact->compaction->is_last_level() && !sub_compact->IsCurrentPenultimateLevel()) { - temperature = - sub_compact->compaction->mutable_cf_options()->last_level_temperature; + temperature = last_level_temp; } fo_copy.temperature = temperature; @@ -1931,8 +1943,6 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, } outputs.AddOutput(std::move(meta), cfd->internal_comparator(), - sub_compact->compaction->mutable_cf_options() - ->check_flush_compaction_key_order, paranoid_file_checks_); } @@ -1945,13 +1955,17 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, sub_compact->compaction->immutable_options()->listeners; outputs.AssignFileWriter(new WritableFileWriter( std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_, - db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(), + db_options_.stats, Histograms::SST_WRITE_MICROS, listeners, + db_options_.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); // TODO(hx235): pass in the correct `oldest_key_time` instead of `0` + const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); TableBuilderOptions tboptions( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), - cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + read_options, write_options, cfd->internal_comparator(), + cfd->internal_tbl_prop_coll_factories(), sub_compact->compaction->output_compression(), sub_compact->compaction->output_compression_opts(), cfd->GetID(), cfd->GetName(), sub_compact->compaction->output_level(), diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index e812cfc72a3..caa1593e72d 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -41,7 +41,6 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" -#include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 56fc51d0582..69099451976 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -46,7 +46,6 @@ #include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" -#include "table/scoped_arena_iterator.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -131,7 +130,7 @@ class CompactionJobStatsTest : public testing::Test, ColumnFamilyOptions cf_opts(options); size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); - for (auto cf : cfs) { + for (const auto& cf : cfs) { ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); } } @@ -160,7 +159,7 @@ class CompactionJobStatsTest : public testing::Test, EXPECT_EQ(cfs.size(), options.size()); std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + column_families.emplace_back(cfs[i], options[i]); } DBOptions db_opts = DBOptions(options[0]); return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index a1689111002..11a757fd68d 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -217,7 +217,7 @@ class CompactionJobTestBase : public testing::Test { /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)), + /*error_handler=*/nullptr, /*read_only=*/false)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()), error_handler_(nullptr, db_options_, &mutex_), @@ -295,17 +295,20 @@ class CompactionJobTestBase : public testing::Test { Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(), &file_writer, nullptr); ASSERT_OK(s); + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr table_builder( cf_options_.table_factory->NewTableBuilder( TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_, + read_options, write_options, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), + cfd_->internal_tbl_prop_coll_factories(), CompressionType::kNoCompression, CompressionOptions(), 0 /* column_family_id */, kDefaultColumnFamilyName, -1 /* level */), file_writer.get())); // Build table. - for (auto kv : contents) { + for (const auto& kv : contents) { std::string key; std::string value; std::tie(key, value) = kv; @@ -324,7 +327,7 @@ class CompactionJobTestBase : public testing::Test { SequenceNumber smallest_seqno = kMaxSequenceNumber; SequenceNumber largest_seqno = 0; uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; - for (auto kv : contents) { + for (const auto& kv : contents) { ParsedInternalKey key; std::string skey; std::string value; @@ -394,7 +397,7 @@ class CompactionJobTestBase : public testing::Test { mutex_.Lock(); EXPECT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &edit, &mutex_, nullptr)); + read_options_, write_options_, &edit, &mutex_, nullptr)); mutex_.Unlock(); } @@ -547,9 +550,9 @@ class CompactionJobTestBase : public testing::Test { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); compaction_job_stats_.Reset(); - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -568,11 +571,11 @@ class CompactionJobTestBase : public testing::Test { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(WriteOptions(), record); } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); @@ -736,6 +739,7 @@ class CompactionJobTestBase : public testing::Test { MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; const ReadOptions read_options_; + const WriteOptions write_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1469,7 +1473,7 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) { } TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { - cf_options_.bottommost_temperature = Temperature::kCold; + cf_options_.last_level_temperature = Temperature::kCold; SyncPoint::GetInstance()->SetCallBack( "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { auto supports_per_key_placement = static_cast(arg); @@ -1748,23 +1752,9 @@ TEST_F(CompactionJobTest, ResultSerialization) { } } -class CompactionJobDynamicFileSizeTest - : public CompactionJobTestBase, - public ::testing::WithParamInterface { - public: - CompactionJobDynamicFileSizeTest() - : CompactionJobTestBase( - test::PerThreadDBPath("compaction_job_dynamic_file_size_test"), - BytewiseComparator(), [](uint64_t /*ts*/) { return ""; }, - /*test_io_priority=*/false, TableTypeForTest::kMockTable) {} -}; - -TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { +TEST_F(CompactionJobTest, CutForMaxCompactionBytes) { // dynamic_file_size option should have no impact on cutting for max // compaction bytes. - bool enable_dyanmic_file_size = GetParam(); - cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; - NewDB(); mutable_cf_options_.target_file_size_base = 80; mutable_cf_options_.max_compaction_bytes = 21; @@ -1838,10 +1828,7 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { {expected_file1, expected_file2}); } -TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) { - bool enable_dyanmic_file_size = GetParam(); - cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; - +TEST_F(CompactionJobTest, CutToSkipGrandparentFile) { NewDB(); // Make sure the grandparent level file size (10) qualifies skipping. // Currently, it has to be > 1/8 of target file size. @@ -1876,28 +1863,15 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) { mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"}, {KeyStr("z", 6U, kTypeValue), "val3"}}); - auto expected_file_disable_dynamic_file_size = - mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, - {KeyStr("c", 3U, kTypeValue), "val"}, - {KeyStr("x", 4U, kTypeValue), "val"}, - {KeyStr("z", 6U, kTypeValue), "val3"}}); - SetLastSequence(6U); const std::vector input_levels = {0, 1}; auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); - if (enable_dyanmic_file_size) { RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_file1, expected_file2}); - } else { - RunCompaction({lvl0_files, lvl1_files}, input_levels, - {expected_file_disable_dynamic_file_size}); - } } -TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) { - bool enable_dyanmic_file_size = GetParam(); - cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; +TEST_F(CompactionJobTest, CutToAlignGrandparentBoundary) { NewDB(); // MockTable has 1 byte per entry by default and each file is 10 bytes. @@ -1964,40 +1938,15 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) { } expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val"); - mock::KVVector expected_file_disable_dynamic_file_size1; - for (char i = 0; i < 10; i++) { - expected_file_disable_dynamic_file_size1.emplace_back( - KeyStr(std::string(1, ch + i), i + 10, kTypeValue), - "val" + std::to_string(i)); - } - - mock::KVVector expected_file_disable_dynamic_file_size2; - for (char i = 10; i < 12; i++) { - expected_file_disable_dynamic_file_size2.emplace_back( - KeyStr(std::string(1, ch + i), i + 10, kTypeValue), - "val" + std::to_string(i)); - } - - expected_file_disable_dynamic_file_size2.emplace_back( - KeyStr("s", 4U, kTypeValue), "val"); - SetLastSequence(22U); const std::vector input_levels = {0, 1}; auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); - if (enable_dyanmic_file_size) { RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_file1, expected_file2}); - } else { - RunCompaction({lvl0_files, lvl1_files}, input_levels, - {expected_file_disable_dynamic_file_size1, - expected_file_disable_dynamic_file_size2}); - } } -TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { - bool enable_dyanmic_file_size = GetParam(); - cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; +TEST_F(CompactionJobTest, CutToAlignGrandparentBoundarySameKey) { NewDB(); // MockTable has 1 byte per entry by default and each file is 10 bytes. @@ -2034,13 +1983,9 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { AddMockFile(file5, 2); mock::KVVector expected_file1; - mock::KVVector expected_file_disable_dynamic_file_size; - for (int i = 0; i < 8; i++) { expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i)); - expected_file_disable_dynamic_file_size.emplace_back( - KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i)); } // make sure `b` is cut in a separated file (so internally it's not using @@ -2049,9 +1994,6 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { auto expected_file2 = mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}}); - expected_file_disable_dynamic_file_size.emplace_back( - KeyStr("b", 90U, kTypeValue), "valb"); - SetLastSequence(122U); const std::vector input_levels = {0, 1}; auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); @@ -2062,20 +2004,13 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { for (int i = 80; i <= 100; i++) { snapshots.emplace_back(i); } - if (enable_dyanmic_file_size) { RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_file1, expected_file2}, snapshots); - } else { - RunCompaction({lvl0_files, lvl1_files}, input_levels, - {expected_file_disable_dynamic_file_size}, snapshots); - } } -TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) { +TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) { // dynamic_file_size option should have no impact on cutting for max // compaction bytes. - bool enable_dyanmic_file_size = GetParam(); - cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; NewDB(); mutable_cf_options_.target_file_size_base = 80; @@ -2132,9 +2067,6 @@ TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) { {expected_file1, expected_file2, expected_file3}, snapshots); } -INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest, - CompactionJobDynamicFileSizeTest, testing::Bool()); - class CompactionJobTimestampTest : public CompactionJobTestBase { public: CompactionJobTimestampTest() diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index eb76cd849a9..f201da56cd0 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -25,11 +25,11 @@ Status CompactionOutputs::Finish( assert(meta != nullptr); Status s = intput_status; if (s.ok()) { - std::string seqno_to_time_mapping_str; - seqno_to_time_mapping.Encode( - seqno_to_time_mapping_str, meta->fd.smallest_seqno, - meta->fd.largest_seqno, meta->file_creation_time); - builder_->SetSeqnoTimeTableProperties(seqno_to_time_mapping_str, + SeqnoToTimeMapping relevant_mapping; + relevant_mapping.CopyFromSeqnoRange( + seqno_to_time_mapping, meta->fd.smallest_seqno, meta->fd.largest_seqno); + relevant_mapping.SetCapacity(kMaxSeqnoTimePairsPerSST); + builder_->SetSeqnoTimeTableProperties(relevant_mapping, meta->oldest_ancester_time); s = builder_->Finish(); @@ -62,12 +62,15 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, Statistics* statistics, bool use_fsync) { IOStatus io_s; - if (input_status.ok()) { + IOOptions opts; + io_s = WritableFileWriter::PrepareIOOptions( + WriteOptions(Env::IOActivity::kCompaction), opts); + if (input_status.ok() && io_s.ok()) { StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS); - io_s = file_writer_->Sync(use_fsync); + io_s = file_writer_->Sync(opts, use_fsync); } if (input_status.ok() && io_s.ok()) { - io_s = file_writer_->Close(); + io_s = file_writer_->Close(opts); } if (input_status.ok() && io_s.ok()) { @@ -317,7 +320,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { being_grandparent_gap_ ? 2 : 3; if (compaction_->immutable_options()->compaction_style == kCompactionStyleLevel && - compaction_->immutable_options()->level_compaction_dynamic_file_size && num_grandparent_boundaries_crossed >= num_skippable_boundaries_crossed && grandparent_overlapped_bytes_ - previous_overlapped_bytes > @@ -339,7 +341,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { // improvement. if (compaction_->immutable_options()->compaction_style == kCompactionStyleLevel && - compaction_->immutable_options()->level_compaction_dynamic_file_size && current_output_file_size_ >= ((compaction_->target_output_file_size() + 99) / 100) * (50 + std::min(grandparent_boundary_switched_num_ * 5, diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 18246cf2faa..f232214e3b7 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -30,11 +30,9 @@ class CompactionOutputs { // compaction output file struct Output { Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, - bool _enable_order_check, bool _enable_hash, bool _finished, - uint64_t precalculated_hash) + bool _enable_hash, bool _finished, uint64_t precalculated_hash) : meta(std::move(_meta)), - validator(_icmp, _enable_order_check, _enable_hash, - precalculated_hash), + validator(_icmp, _enable_hash, precalculated_hash), finished(_finished) {} FileMetaData meta; OutputValidator validator; @@ -49,10 +47,10 @@ class CompactionOutputs { // Add generated output to the list void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp, - bool enable_order_check, bool enable_hash, - bool finished = false, uint64_t precalculated_hash = 0) { - outputs_.emplace_back(std::move(meta), icmp, enable_order_check, - enable_hash, finished, precalculated_hash); + bool enable_hash, bool finished = false, + uint64_t precalculated_hash = 0) { + outputs_.emplace_back(std::move(meta), icmp, enable_hash, finished, + precalculated_hash); } // Set new table builder for the current output diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 4d40ab50341..53ef7bc6df1 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -130,7 +130,7 @@ CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : ioptions_(ioptions), icmp_(icmp) {} -CompactionPicker::~CompactionPicker() {} +CompactionPicker::~CompactionPicker() = default; // Delete this compaction from the list of running compactions. void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { @@ -377,7 +377,8 @@ Compaction* CompactionPicker::CompactFiles( output_level, compact_options.output_file_size_limit, mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, GetCompressionOptions(mutable_cf_options, vstorage, output_level), - Temperature::kUnknown, compact_options.max_subcompactions, + mutable_cf_options.default_write_temperature, + compact_options.max_subcompactions, /* grandparents */ {}, true); RegisterCompaction(c); return c; @@ -456,10 +457,9 @@ bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage, // Returns false if files on parent level are currently in compaction, which // means that we can't compact them bool CompactionPicker::SetupOtherInputs( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, CompactionInputFiles* inputs, - CompactionInputFiles* output_level_inputs, int* parent_index, - int base_index, bool only_expand_towards_right) { + const std::string& cf_name, VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, CompactionInputFiles* output_level_inputs, + int* parent_index, int base_index, bool only_expand_towards_right) { assert(!inputs->empty()); assert(output_level_inputs->empty()); const int input_level = inputs->level; @@ -500,7 +500,6 @@ bool CompactionPicker::SetupOtherInputs( // user key, while excluding other entries for the same user key. This // can happen when one user key spans multiple files. if (!output_level_inputs->empty()) { - const uint64_t limit = mutable_cf_options.max_compaction_bytes; const uint64_t output_level_inputs_size = TotalFileSize(output_level_inputs->files); const uint64_t inputs_size = TotalFileSize(inputs->files); @@ -527,8 +526,6 @@ bool CompactionPicker::SetupOtherInputs( try_overlapping_inputs = false; } if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() && - (mutable_cf_options.ignore_max_compaction_bytes_for_input || - output_level_inputs_size + expanded_inputs_size < limit) && !AreFilesInCompaction(expanded_inputs.files)) { InternalKey new_start, new_limit; GetRange(expanded_inputs, &new_start, &new_limit); @@ -551,8 +548,6 @@ bool CompactionPicker::SetupOtherInputs( base_index, nullptr); expanded_inputs_size = TotalFileSize(expanded_inputs.files); if (expanded_inputs.size() > inputs->size() && - (mutable_cf_options.ignore_max_compaction_bytes_for_input || - output_level_inputs_size + expanded_inputs_size < limit) && !AreFilesInCompaction(expanded_inputs.files)) { expand_inputs = true; } @@ -670,7 +665,8 @@ Compaction* CompactionPicker::CompactRange( compact_range_options.target_path_id, GetCompressionType(vstorage, mutable_cf_options, output_level, 1), GetCompressionOptions(mutable_cf_options, vstorage, output_level), - Temperature::kUnknown, compact_range_options.max_subcompactions, + mutable_cf_options.default_write_temperature, + compact_range_options.max_subcompactions, /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1, /* deletion_compaction */ false, /* l0_files_might_overlap */ true, CompactionReason::kUnknown, @@ -812,8 +808,8 @@ Compaction* CompactionPicker::CompactRange( output_level_inputs.level = output_level; if (input_level != output_level) { int parent_index = -1; - if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs, - &output_level_inputs, &parent_index, -1)) { + if (!SetupOtherInputs(cf_name, vstorage, &inputs, &output_level_inputs, + &parent_index, -1)) { // manual compaction is now multi-threaded, so it can // happen that SetupOtherInputs fails // we handle it higher in RunManualCompaction @@ -858,8 +854,9 @@ Compaction* CompactionPicker::CompactRange( GetCompressionType(vstorage, mutable_cf_options, output_level, vstorage->base_level()), GetCompressionOptions(mutable_cf_options, vstorage, output_level), - Temperature::kUnknown, compact_range_options.max_subcompactions, - std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1, + mutable_cf_options.default_write_temperature, + compact_range_options.max_subcompactions, std::move(grandparents), + /* is manual */ true, trim_ts, /* score */ -1, /* deletion_compaction */ false, /* l0_files_might_overlap */ true, CompactionReason::kUnknown, compact_range_options.blob_garbage_collection_policy, diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index 0556e992754..63542a387a7 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -186,7 +186,6 @@ class CompactionPicker { int penultimate_level) const; bool SetupOtherInputs(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, CompactionInputFiles* inputs, CompactionInputFiles* output_level_inputs, @@ -277,8 +276,7 @@ class NullCompactionPicker : public CompactionPicker { } // Always returns false. - virtual bool NeedsCompaction( - const VersionStorageInfo* /*vstorage*/) const override { + bool NeedsCompaction(const VersionStorageInfo* /*vstorage*/) const override { return false; } }; diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 50529777028..d898b5126de 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -116,7 +116,8 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, mutable_db_options, std::move(inputs), 0, 0, 0, 0, kNoCompression, - mutable_cf_options.compression_opts, Temperature::kUnknown, + mutable_cf_options.compression_opts, + mutable_cf_options.default_write_temperature, /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0), /* is deletion compaction */ true, /* l0_files_might_overlap */ true, @@ -185,7 +186,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, mutable_cf_options.compression, - mutable_cf_options.compression_opts, Temperature::kUnknown, + mutable_cf_options.compression_opts, + mutable_cf_options.default_write_temperature, 0 /* max_subcompactions */, {}, /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0), /* is deletion compaction */ false, @@ -280,7 +282,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( /* target_file_size */ 0, /* max_compaction_bytes */ 0, /* output_path_id */ 0, kNoCompression, - mutable_cf_options.compression_opts, Temperature::kUnknown, + mutable_cf_options.compression_opts, + mutable_cf_options.default_write_temperature, /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0), /* is deletion compaction */ true, @@ -414,6 +417,7 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( if (inputs[0].files.empty()) { return nullptr; } + assert(compaction_target_temp != Temperature::kLastTemperature); Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, mutable_db_options, diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index df21a1bde0f..5badc491c2f 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -18,26 +18,27 @@ class FIFOCompactionPicker : public CompactionPicker { const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, - VersionStorageInfo* version, - LogBuffer* log_buffer) override; - - virtual Compaction* CompactRange( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - int input_level, int output_level, - const CompactRangeOptions& compact_range_options, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, bool* manual_conflict, - uint64_t max_file_num_to_ignore, const std::string& trim_ts) override; + Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer) override; + + Compaction* CompactRange(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, int input_level, + int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, + const std::string& trim_ts) override; // The maximum allowed output level. Always returns 0. - virtual int MaxOutputLevel() const override { return 0; } + int MaxOutputLevel() const override { return 0; } - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; + bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; private: Compaction* PickTTLCompaction(const std::string& cf_name, diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index c436689bb65..3cb45211298 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -45,6 +45,13 @@ bool LevelCompactionPicker::NeedsCompaction( } namespace { + +enum class CompactToNextLevel { + kNo, // compact to the same level as the input file + kYes, // compact to the next level except the last level to the same level + kSkipLastLevel, // compact to the next level but skip the last level +}; + // A class to build a leveled compaction step-by-step. class LevelCompactionBuilder { public: @@ -106,6 +113,15 @@ class LevelCompactionBuilder { // otherwise, returns false. bool PickIntraL0Compaction(); + // When total L0 size is small compared to Lbase, try to pick intra-L0 + // compaction starting from the newest L0 file. This helps to prevent + // L0->Lbase compaction with large write-amp. + // + // Returns true iff an intra-L0 compaction is picked. + // `start_level_inputs_` and `output_level_` will be updated accordingly if + // a compaction is picked. + bool PickSizeBasedIntraL0Compaction(); + // Return true if TrivialMove is extended. `start_index` is the index of // the initial file picked, which should already be in `start_level_inputs_`. bool TryExtendNonL0TrivialMove(int start_index, @@ -115,9 +131,10 @@ class LevelCompactionBuilder { // level_files is a vector of (level, file metadata) in ascending order of // level. If compact_to_next_level is true, compact the file to the next // level, otherwise, compact to the same level as the input file. + // If skip_last_level is true, skip the last level. void PickFileToCompact( const autovector>& level_files, - bool compact_to_next_level); + CompactToNextLevel compact_to_next_level); const std::string& cf_name_; VersionStorageInfo* vstorage_; @@ -149,20 +166,24 @@ class LevelCompactionBuilder { void LevelCompactionBuilder::PickFileToCompact( const autovector>& level_files, - bool compact_to_next_level) { + CompactToNextLevel compact_to_next_level) { for (auto& level_file : level_files) { // If it's being compacted it has nothing to do here. // If this assert() fails that means that some function marked some // files as being_compacted, but didn't call ComputeCompactionScore() assert(!level_file.second->being_compacted); start_level_ = level_file.first; - if ((compact_to_next_level && + if ((compact_to_next_level == CompactToNextLevel::kSkipLastLevel && start_level_ == vstorage_->num_non_empty_levels() - 1) || (start_level_ == 0 && !compaction_picker_->level0_compactions_in_progress()->empty())) { continue; } - if (compact_to_next_level) { + + // Compact to the next level only if the file is not in the last level and + // compact_to_next_level is kYes or kSkipLastLevel. + if (compact_to_next_level != CompactToNextLevel::kNo && + (start_level_ < vstorage_->num_non_empty_levels() - 1)) { output_level_ = (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; } else { @@ -248,7 +269,8 @@ void LevelCompactionBuilder::SetupInitialFiles() { } // Bottommost Files Compaction on deleting tombstones - PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false); + PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), + CompactToNextLevel::kNo); if (!start_level_inputs_.empty()) { compaction_reason_ = CompactionReason::kBottommostFiles; return; @@ -274,21 +296,26 @@ void LevelCompactionBuilder::SetupInitialFiles() { } } - PickFileToCompact(vstorage_->ExpiredTtlFiles(), true); + PickFileToCompact(vstorage_->ExpiredTtlFiles(), + CompactToNextLevel::kSkipLastLevel); if (!start_level_inputs_.empty()) { compaction_reason_ = CompactionReason::kTtl; return; } // Periodic Compaction - PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false); + PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), + ioptions_.level_compaction_dynamic_level_bytes + ? CompactToNextLevel::kYes + : CompactToNextLevel::kNo); if (!start_level_inputs_.empty()) { compaction_reason_ = CompactionReason::kPeriodicCompaction; return; } // Forced blob garbage collection - PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), + CompactToNextLevel::kNo); if (!start_level_inputs_.empty()) { compaction_reason_ = CompactionReason::kForcedBlobGC; return; @@ -328,7 +355,9 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() { TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin"); // Only expand the inputs when we have selected a file in start_level_inputs_ - if (start_level_inputs_.size() == 0) return; + if (start_level_inputs_.size() == 0) { + return; + } uint64_t start_lvl_bytes_no_compacting = 0; uint64_t curr_bytes_to_compact = 0; @@ -438,9 +467,8 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { } if (!is_l0_trivial_move_ && !compaction_picker_->SetupOtherInputs( - cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, - &output_level_inputs_, &parent_index_, base_index_, - round_robin_expanding)) { + cf_name_, vstorage_, &start_level_inputs_, &output_level_inputs_, + &parent_index_, base_index_, round_robin_expanding)) { return false; } @@ -524,7 +552,7 @@ Compaction* LevelCompactionBuilder::GetCompaction() { GetCompressionType(vstorage_, mutable_cf_options_, output_level_, vstorage_->base_level()), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), - Temperature::kUnknown, + mutable_cf_options_.default_write_temperature, /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, /* trim_ts */ "", start_level_score_, false /* deletion_compaction */, l0_files_might_overlap, compaction_reason_); @@ -758,6 +786,9 @@ bool LevelCompactionBuilder::PickFileToCompact() { // being compacted at level 0. if (start_level_ == 0 && !compaction_picker_->level0_compactions_in_progress()->empty()) { + if (PickSizeBasedIntraL0Compaction()) { + return true; + } TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); return false; } @@ -770,6 +801,9 @@ bool LevelCompactionBuilder::PickFileToCompact() { if (TryPickL0TrivialMove()) { return true; } + if (start_level_ == 0 && PickSizeBasedIntraL0Compaction()) { + return true; + } const std::vector& level_files = vstorage_->LevelFiles(start_level_); @@ -874,6 +908,56 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() { mutable_cf_options_.max_compaction_bytes, &start_level_inputs_); } + +bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() { + assert(start_level_ == 0); + int base_level = vstorage_->base_level(); + if (base_level <= 0) { + return false; + } + const std::vector& l0_files = + vstorage_->LevelFiles(/*level=*/0); + size_t min_num_file = + std::max(2, mutable_cf_options_.level0_file_num_compaction_trigger); + if (l0_files.size() < min_num_file) { + return false; + } + uint64_t l0_size = 0; + for (const auto& file : l0_files) { + l0_size += file->fd.GetFileSize(); + } + const uint64_t min_lbase_size = + l0_size * static_cast(std::max( + 10.0, mutable_cf_options_.max_bytes_for_level_multiplier)); + assert(min_lbase_size >= l0_size); + const std::vector& lbase_files = + vstorage_->LevelFiles(/*level=*/base_level); + uint64_t lbase_size = 0; + for (const auto& file : lbase_files) { + lbase_size += file->fd.GetFileSize(); + if (lbase_size > min_lbase_size) { + break; + } + } + if (lbase_size <= min_lbase_size) { + return false; + } + + start_level_inputs_.clear(); + start_level_inputs_.level = 0; + for (const auto& file : l0_files) { + if (file->being_compacted) { + break; + } + start_level_inputs_.files.push_back(file); + } + if (start_level_inputs_.files.size() < min_num_file) { + start_level_inputs_.clear(); + return false; + } + output_level_ = 0; + return true /* picked an intra-L0 compaction */; +} } // namespace Compaction* LevelCompactionPicker::PickCompaction( diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h index 6eb0f586f4d..e822e3396cb 100644 --- a/db/compaction/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -20,14 +20,13 @@ class LevelCompactionPicker : public CompactionPicker { LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; + Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; + bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 3241d034d31..beac419d98e 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -77,7 +77,7 @@ class CompactionPickerTestBase : public testing::Test { ioptions_.level_compaction_dynamic_level_bytes = false; } - ~CompactionPickerTestBase() override {} + ~CompactionPickerTestBase() override = default; void NewVersionStorage(int num_levels, CompactionStyle style) { DeleteVersionStorage(); @@ -214,7 +214,7 @@ class CompactionPickerTest : public CompactionPickerTestBase { explicit CompactionPickerTest() : CompactionPickerTestBase(BytewiseComparator()) {} - ~CompactionPickerTest() override {} + ~CompactionPickerTest() override = default; }; class CompactionPickerU64TsTest : public CompactionPickerTestBase { @@ -222,7 +222,7 @@ class CompactionPickerU64TsTest : public CompactionPickerTestBase { explicit CompactionPickerU64TsTest() : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {} - ~CompactionPickerU64TsTest() override {} + ~CompactionPickerU64TsTest() override = default; }; TEST_F(CompactionPickerTest, Empty) { @@ -1443,15 +1443,12 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { ioptions_.compaction_pri = kMinOverlappingRatio; mutable_cf_options_.max_bytes_for_level_base = 10000000; mutable_cf_options_.max_bytes_for_level_multiplier = 10; - mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; - // file 7 and 8 over lap with the same file, but file 8 is smaller so - // it will be picked. - // Overlaps with file 26, 27. And the file is compensated so will be - // picked up. + // Overlaps with file 26, 27, ratio is + // (60000000U + 60000000U / 180000000U) = 0.67 Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U); - Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 - Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28 + Add(2, 7U, "178", "189", 60000000U); // Overlaps with file 28 + Add(2, 8U, "401", "500", 61000000U); // Overlaps with file 29 Add(3, 26U, "160", "165", 60000000U); // Boosted file size in output level is not considered. @@ -1465,7 +1462,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); - // Picking file 8 because overlapping ratio is the biggest. + // Picking file 6 because overlapping ratio is the biggest. ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); } @@ -2468,46 +2465,23 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) { DeleteVersionStorage(); } -TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { +TEST_F(CompactionPickerTest, IgnoreCompactionLimitWhenAddFileFromInputLevel) { mutable_cf_options_.max_bytes_for_level_base = 1000000u; mutable_cf_options_.max_compaction_bytes = 800000u; - mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; - ioptions_.level_compaction_dynamic_level_bytes = false; - NewVersionStorage(6, kCompactionStyleLevel); - // A compaction should be triggered and pick file 2 and 5. - // It can expand because adding file 1 and 3, the compaction size will - // exceed mutable_cf_options_.max_bytes_for_level_base. - Add(1, 1U, "100", "150", 300000U); - Add(1, 2U, "151", "200", 300001U, 0, 0); - Add(1, 3U, "201", "250", 300000U, 0, 0); - Add(1, 4U, "251", "300", 300000U, 0, 0); - Add(2, 5U, "100", "256", 1U); - UpdateVersionStorageInfo(); - - std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); - ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(2U, compaction->num_input_levels()); - ASSERT_EQ(1U, compaction->num_input_files(0)); - ASSERT_EQ(1U, compaction->num_input_files(1)); - ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); - ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); -} - -TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) { - mutable_cf_options_.max_bytes_for_level_base = 800000u; - mutable_cf_options_.max_compaction_bytes = 1000000u; - mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; ioptions_.level_compaction_dynamic_level_bytes = false; NewVersionStorage(6, kCompactionStyleLevel); // A compaction should be triggered and pick file 2 and 5. - // and it expands to file 1 and 3 too. + // It pulls in other compaction input file from the input level L1 + // without pulling in more output level files. + // Files 1, 3, 4 will be included in the compaction. + // File 6 is excluded since it overlaps with file 7. Add(1, 1U, "100", "150", 300000U); Add(1, 2U, "151", "200", 300001U, 0, 0); Add(1, 3U, "201", "250", 300000U, 0, 0); Add(1, 4U, "251", "300", 300000U, 0, 0); - Add(2, 5U, "000", "251", 1U); + Add(1, 6U, "325", "400", 300000U, 0, 0); + Add(2, 5U, "100", "350", 1U); + Add(2, 7U, "375", "425", 1U); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( @@ -2515,11 +2489,12 @@ TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) { &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); - ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_EQ(4U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->num_input_files(1)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber()); ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); } @@ -4149,9 +4124,177 @@ TEST_P(PerKeyPlacementCompactionPickerTest, } } +TEST_F(CompactionPickerTest, + LevelCompactionPrioritizeFilesMarkedForCompaction1) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + // L5 + // F4 [100, 200] size:100, F5 [300, 400] size:100, F6 [500, 600] size:200 + // F5 is marked for compaction + // L6 + // F1 [100, 200] size:100, F2 [300, 400] size:1000, F3 [500, 600] size:100 + // + // First compaction should pick F5 since it's marked for compaction. + // Second compaction should pick F6 since it has min overlap ratio. + Add(/*level=*/num_levels - 1, /*file_number=*/1U, /*smallest=*/"100", + /*largest=*/"200", + /*file_size=*/100, /*path_id=*/0, /*smallest_seq=*/0, + /*largest_seq=*/0, /*compensated_file_size=*/100, + /*marked_for_compact=*/false); + Add(/*level=*/num_levels - 1, /*file_number=*/2U, /*smallest=*/"300", + /*largest=*/"400", + /*file_size=*/1000, /*path_id=*/0, /*smallest_seq=*/0, + /*largest_seq=*/0, /*compensated_file_size=*/1000, + /*marked_for_compact=*/false); + Add(/*level=*/num_levels - 1, /*file_number=*/3U, /*smallest=*/"500", + /*largest=*/"600", + /*file_size=*/100, /*path_id=*/0, /*smallest_seq=*/0, + /*largest_seq=*/0, /*compensated_file_size=*/100, + /*marked_for_compact=*/false); + + Add(/*level=*/num_levels - 2, /*file_number=*/4U, /*smallest=*/"100", + /*largest=*/"200", + /*file_size=*/100, /*path_id=*/0, /*smallest_seq=*/100, + /*largest_seq=*/200, /*compensated_file_size=*/100, + /*marked_for_compact=*/false); + // Marked for compaction, but with a larger overlap ratio. + Add(/*level=*/num_levels - 2, /*file_number=*/5U, /*smallest=*/"300", + /*largest=*/"400", + /*file_size=*/100, /*path_id=*/0, /*smallest_seq=*/300, + /*largest_seq=*/400, /*compensated_file_size=*/100, + /*marked_for_compact=*/true); + Add(/*level=*/num_levels - 2, /*file_number=*/6U, /*smallest=*/"500", + /*largest=*/"600", + /*file_size=*/200, /*path_id=*/0, /*smallest_seq=*/400, + /*largest_seq=*/500, /*compensated_file_size=*/200, + /*marked_for_compact=*/false); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(num_levels - 2, compaction->start_level()); + ASSERT_EQ(num_levels - 1, compaction->output_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + + std::unique_ptr second_compaction( + level_compaction_picker.PickCompaction(cf_name_, mutable_cf_options_, + mutable_db_options_, + vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(second_compaction); + ASSERT_EQ(num_levels - 1, compaction->output_level()); + ASSERT_EQ(num_levels - 2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(6U, second_compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, + LevelCompactionPrioritizeFilesMarkedForCompaction2) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + // L4 + // F3 [100, 200] size:2000 + // L5 + // F2 [100, 200] size:2000, marked for compaction + // L6 + // F1 [100, 200] size: 20000 + // + // L4 should be prioritized over L5 since L4 has a higher compaction score. + // Files marked for compaction do not affect level picking order. + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(/*level=*/num_levels - 1, /*file_number=*/1U, /*smallest=*/"100", + /*largest=*/"200", + /*file_size=*/20000, /*path_id=*/0, /*smallest_seq=*/0, + /*largest_seq=*/0, /*compensated_file_size=*/100, + /*marked_for_compact=*/false); + // Level score should be 1. + Add(/*level=*/num_levels - 2, /*file_number=*/2U, /*smallest=*/"100", + /*largest=*/"200", + /*file_size=*/2000, /*path_id=*/0, /*smallest_seq=*/100, + /*largest_seq=*/200, /*compensated_file_size=*/2000, + /*marked_for_compact=*/true); + // Level score should be larger than L5. + Add(/*level=*/num_levels - 3, /*file_number=*/3U, /*smallest=*/"100", + /*largest=*/"200", + /*file_size=*/2000, /*path_id=*/0, /*smallest_seq=*/300, + /*largest_seq=*/400, /*compensated_file_size=*/2000, + /*marked_for_compact=*/false); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(num_levels - 3, compaction->start_level()); + ASSERT_EQ(num_levels - 2, compaction->output_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); +} + INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest, PerKeyPlacementCompactionPickerTest, ::testing::Bool()); +TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) { + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + for (const uint64_t lbase_size_multiplier : {1, 10, 11, 40}) { + SCOPED_TRACE("lbase_size_multiplier=" + + std::to_string(lbase_size_multiplier)); + NewVersionStorage(6, kCompactionStyleLevel); + // When L0 size is <= Lbase size / max_bytes_for_level_multiplier, + // intra-L0 compaction is picked. Otherwise, L0->L1 + // compaction is picked. + Add(/*level=*/0, /*file_number=*/1U, /*smallest=*/"100", + /*largest=*/"200", /*file_size=*/1000, /*path_id=*/0, + /*smallest_seq=*/10, /*largest_seq=*/11, + /*compensated_file_size=*/1000); + Add(/*level=*/0, /*file_number=*/2U, /*smallest=*/"100", + /*largest=*/"100", /*file_size=*/1000, /*path_id=*/0, + /*smallest_seq=*/20, /*largest_seq=*/21, + /*compensated_file_size=*/1000); + Add(/*level=*/0, /*file_number=*/3U, /*smallest=*/"100", + /*largest=*/"200", /*file_size=*/1000, /*path_id=*/0, + /*smallest_seq=*/30, /*largest_seq=*/31, + /*compensated_file_size=*/1000); + Add(/*level=*/0, /*file_number=*/4U, /*smallest=*/"100", + /*largest=*/"200", /*file_size=*/1000, /*path_id=*/0, + /*smallest_seq=*/40, /*largest_seq=*/41, + /*compensated_file_size=*/1000); + const uint64_t l0_size = 4000; + const uint64_t lbase_size = l0_size * lbase_size_multiplier; + Add(/*level=*/1, /*file_number=*/5U, /*smallest=*/"100", + /*largest=*/"200", /*file_size=*/lbase_size, /*path_id=*/0, + /*smallest_seq=*/0, /*largest_seq=*/0, + /*compensated_file_size=*/lbase_size); + UpdateVersionStorageInfo(); + + LevelCompactionPicker compaction_picker(ioptions_, &icmp_); + std::unique_ptr compaction(compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + if (lbase_size_multiplier > + mutable_cf_options_.max_bytes_for_level_multiplier) { + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(0, compaction->output_level()); + } else { + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->output_level()); + } + } +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 6d9ff43cd54..597edb7919a 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -870,7 +870,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( output_level, 1, enable_compression), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, enable_compression), - Temperature::kUnknown, + mutable_cf_options_.default_write_temperature, /* max_subcompactions */ 0, grandparents, /* is manual */ false, /* trim_ts */ "", score_, false /* deletion_compaction */, @@ -1139,8 +1139,7 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( // from bottom_start_idx and bottom_end_idx, but for now, we use // SetupOtherInputs() for simplicity. int parent_index = -1; // Create and use bottom_start_idx? - if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, - &second_last_level_inputs, + if (!picker_->SetupOtherInputs(cf_name_, vstorage_, &second_last_level_inputs, &bottom_level_inputs, &parent_index, /*base_index=*/-1)) { return nullptr; @@ -1205,7 +1204,7 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( true /* enable_compression */), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, true /* enable_compression */), - Temperature::kUnknown, + mutable_cf_options_.default_write_temperature, /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, /* trim_ts */ "", score_, false /* deletion_compaction */, /* l0_files_might_overlap */ true, @@ -1311,9 +1310,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { int parent_index = -1; output_level_inputs.level = output_level; - if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, - &start_level_inputs, &output_level_inputs, - &parent_index, -1)) { + if (!picker_->SetupOtherInputs(cf_name_, vstorage_, &start_level_inputs, + &output_level_inputs, &parent_index, -1)) { return nullptr; } inputs.push_back(start_level_inputs); @@ -1349,7 +1347,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id, GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), - Temperature::kUnknown, + mutable_cf_options_.default_write_temperature, /* max_subcompactions */ 0, grandparents, /* is manual */ false, /* trim_ts */ "", score_, false /* deletion_compaction */, /* l0_files_might_overlap */ true, @@ -1442,7 +1440,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( true /* enable_compression */), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, true /* enable_compression */), - Temperature::kUnknown, + mutable_cf_options_.default_write_temperature, /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, /* trim_ts */ "", score_, false /* deletion_compaction */, /* l0_files_might_overlap */ true, compaction_reason); diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index cb160596929..b6103088fbd 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -17,14 +17,13 @@ class UniversalCompactionPicker : public CompactionPicker { UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; + int MaxOutputLevel() const override { return NumberLevels() - 1; } - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; + bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index 3149bb50025..2411c27aac3 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -74,24 +74,24 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( compaction_input.output_level, input_files_oss.str().c_str()); CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact), thread_pri_); - CompactionServiceJobStatus compaction_status = - db_options_.compaction_service->StartV2(info, compaction_input_binary); - switch (compaction_status) { + CompactionServiceScheduleResponse response = + db_options_.compaction_service->Schedule(info, compaction_input_binary); + switch (response.status) { case CompactionServiceJobStatus::kSuccess: break; case CompactionServiceJobStatus::kFailure: sub_compact->status = Status::Incomplete( - "CompactionService failed to start compaction job."); + "CompactionService failed to schedule a remote compaction job."); ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Remote compaction failed to start.", compaction_input.column_family.name.c_str(), job_id_); - return compaction_status; + return response.status; case CompactionServiceJobStatus::kUseLocal: ROCKS_LOG_INFO( db_options_.info_log, - "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + "[%s] [JOB %d] Remote compaction fallback to local by API (Schedule)", compaction_input.column_family.name.c_str(), job_id_); - return compaction_status; + return response.status; default: assert(false); // unknown status break; @@ -101,14 +101,15 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( "[%s] [JOB %d] Waiting for remote compaction...", compaction_input.column_family.name.c_str(), job_id_); std::string compaction_result_binary; - compaction_status = db_options_.compaction_service->WaitForCompleteV2( - info, &compaction_result_binary); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->Wait(response.scheduled_job_id, + &compaction_result_binary); if (compaction_status == CompactionServiceJobStatus::kUseLocal) { - ROCKS_LOG_INFO(db_options_.info_log, - "[%s] [JOB %d] Remote compaction fallback to local by API " - "WaitForComplete.", - compaction_input.column_family.name.c_str(), job_id_); + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API (Wait)", + compaction_input.column_family.name.c_str(), job_id_); return compaction_status; } @@ -195,8 +196,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( auto cfd = compaction->column_family_data(); sub_compact->Current().AddOutput(std::move(meta), - cfd->internal_comparator(), false, false, - true, file.paranoid_hash); + cfd->internal_comparator(), false, true, + file.paranoid_hash); } sub_compact->compaction_job_stats = compaction_result.stats; sub_compact->Current().SetNumOutputRecords( @@ -830,4 +831,3 @@ bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, } #endif // NDEBUG } // namespace ROCKSDB_NAMESPACE - diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc index 7c87f88d1be..812a658dcff 100644 --- a/db/compaction/compaction_service_test.cc +++ b/db/compaction/compaction_service_test.cc @@ -31,40 +31,44 @@ class MyTestCompactionService : public CompactionService { const char* Name() const override { return kClassName(); } - CompactionServiceJobStatus StartV2( + CompactionServiceScheduleResponse Schedule( const CompactionServiceJobInfo& info, const std::string& compaction_service_input) override { InstrumentedMutexLock l(&mutex_); start_info_ = info; assert(info.db_name == db_path_); - jobs_.emplace(info.job_id, compaction_service_input); - CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; - if (is_override_start_status_) { - return override_start_status_; - } - return s; - } - - CompactionServiceJobStatus WaitForCompleteV2( - const CompactionServiceJobInfo& info, - std::string* compaction_service_result) override { + std::string unique_id = Env::Default()->GenerateUniqueId(); + jobs_.emplace(unique_id, compaction_service_input); + infos_.emplace(unique_id, info); + CompactionServiceScheduleResponse response( + unique_id, is_override_start_status_ + ? override_start_status_ + : CompactionServiceJobStatus::kSuccess); + return response; + } + + CompactionServiceJobStatus Wait(const std::string& scheduled_job_id, + std::string* result) override { std::string compaction_input; - assert(info.db_name == db_path_); { InstrumentedMutexLock l(&mutex_); - wait_info_ = info; - auto i = jobs_.find(info.job_id); - if (i == jobs_.end()) { + auto job_index = jobs_.find(scheduled_job_id); + if (job_index == jobs_.end()) { return CompactionServiceJobStatus::kFailure; } - compaction_input = std::move(i->second); - jobs_.erase(i); - } + compaction_input = std::move(job_index->second); + jobs_.erase(job_index); + auto info_index = infos_.find(scheduled_job_id); + if (info_index == infos_.end()) { + return CompactionServiceJobStatus::kFailure; + } + wait_info_ = std::move(info_index->second); + infos_.erase(info_index); + } if (is_override_wait_status_) { return override_wait_status_; } - CompactionServiceOptionsOverride options_override; options_override.env = options_.env; options_override.file_checksum_gen_factory = @@ -90,11 +94,11 @@ class MyTestCompactionService : public CompactionService { OpenAndCompactOptions options; options.canceled = &canceled_; - Status s = DB::OpenAndCompact( - options, db_path_, db_path_ + "/" + std::to_string(info.job_id), - compaction_input, compaction_service_result, options_override); + Status s = + DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id, + compaction_input, result, options_override); if (is_override_wait_result_) { - *compaction_service_result = override_wait_result_; + *result = override_wait_result_; } compaction_num_.fetch_add(1); if (s.ok()) { @@ -135,7 +139,8 @@ class MyTestCompactionService : public CompactionService { private: InstrumentedMutex mutex_; std::atomic_int compaction_num_{0}; - std::map jobs_; + std::map jobs_; + std::map infos_; const std::string db_path_; Options options_; std::shared_ptr statistics_; @@ -172,6 +177,7 @@ class CompactionServiceTest : public DBTestBase { remote_table_properties_collector_factories); options->compaction_service = compaction_service_; DestroyAndReopen(*options); + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, *options); } Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); } @@ -183,36 +189,45 @@ class CompactionServiceTest : public DBTestBase { return static_cast_with_check(cs); } - void GenerateTestData() { - // Generate 20 files @ L2 - for (int i = 0; i < 20; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 10 + j; - ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + void GenerateTestData(bool move_files_manually = false) { + // Generate 20 files @ L2 Per CF + for (int cf_id = 0; cf_id < static_cast(handles_.size()); cf_id++) { + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(cf_id, Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush(cf_id)); + } + if (move_files_manually) { + MoveFilesToLevel(2, cf_id); } - ASSERT_OK(Flush()); - } - MoveFilesToLevel(2); - // Generate 10 files @ L1 overlap with all 20 files @ L2 - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 20 + j * 2; - ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + // Generate 10 files @ L1 overlap with all 20 files @ L2 + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK( + Put(cf_id, Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush(cf_id)); + } + if (move_files_manually) { + MoveFilesToLevel(1, cf_id); + ASSERT_EQ(FilesPerLevel(cf_id), "0,10,20"); } - ASSERT_OK(Flush()); } - MoveFilesToLevel(1); - ASSERT_EQ(FilesPerLevel(), "0,10,20"); } void VerifyTestData() { - for (int i = 0; i < 200; i++) { - auto result = Get(Key(i)); - if (i % 2) { - ASSERT_EQ(result, "value" + std::to_string(i)); - } else { - ASSERT_EQ(result, "value_new" + std::to_string(i)); + for (int cf_id = 0; cf_id < static_cast(handles_.size()); cf_id++) { + for (int i = 0; i < 200; i++) { + auto result = Get(cf_id, Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } } } } @@ -234,32 +249,10 @@ TEST_F(CompactionServiceTest, BasicCompactions) { Statistics* primary_statistics = GetPrimaryStatistics(); Statistics* compactor_statistics = GetCompactorStatistics(); - for (int i = 0; i < 20; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 10 + j; - ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } - - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 20 + j * 2; - ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } + GenerateTestData(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); + VerifyTestData(); - // verify result - for (int i = 0; i < 200; i++) { - auto result = Get(Key(i)); - if (i % 2) { - ASSERT_EQ(result, "value" + std::to_string(i)); - } else { - ASSERT_EQ(result, "value_new" + std::to_string(i)); - } - } auto my_cs = GetCompactionService(); ASSERT_GE(my_cs->GetCompactionNum(), 1); @@ -322,7 +315,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) { assert(*id != kNullUniqueId64x2); verify_passed++; }); - Reopen(options); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "cf_1", "cf_2", "cf_3"}, + options); ASSERT_GT(verify_passed, 0); Close(); } @@ -490,26 +484,9 @@ TEST_F(CompactionServiceTest, CompactionFilter) { new PartialDeleteCompactionFilter()); options.compaction_filter = delete_comp_filter.get(); ReopenWithCompactionService(&options); - - for (int i = 0; i < 20; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 10 + j; - ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } - - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 20 + j * 2; - ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } + GenerateTestData(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // verify result for (int i = 0; i < 200; i++) { auto result = Get(Key(i)); @@ -551,17 +528,17 @@ TEST_F(CompactionServiceTest, ConcurrentCompaction) { options.level0_file_num_compaction_trigger = 100; options.max_background_jobs = 20; ReopenWithCompactionService(&options); - GenerateTestData(); + GenerateTestData(true); ColumnFamilyMetaData meta; db_->GetColumnFamilyMetaData(&meta); std::vector threads; for (const auto& file : meta.levels[1].files) { - threads.emplace_back(std::thread([&]() { + threads.emplace_back([&]() { std::string fname = file.db_path + "/" + file.name; ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); - })); + }); } for (auto& thread : threads) { @@ -570,14 +547,7 @@ TEST_F(CompactionServiceTest, ConcurrentCompaction) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); // verify result - for (int i = 0; i < 200; i++) { - auto result = Get(Key(i)); - if (i % 2) { - ASSERT_EQ(result, "value" + std::to_string(i)); - } else { - ASSERT_EQ(result, "value_new" + std::to_string(i)); - } - } + VerifyTestData(); auto my_cs = GetCompactionService(); ASSERT_EQ(my_cs->GetCompactionNum(), 10); ASSERT_EQ(FilesPerLevel(), "0,0,10"); @@ -587,21 +557,7 @@ TEST_F(CompactionServiceTest, CompactionInfo) { Options options = CurrentOptions(); ReopenWithCompactionService(&options); - for (int i = 0; i < 20; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 10 + j; - ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } - - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 20 + j * 2; - ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } + GenerateTestData(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); auto my_cs = static_cast_with_check(GetCompactionService()); @@ -676,32 +632,9 @@ TEST_F(CompactionServiceTest, FallbackLocalAuto) { my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal); - for (int i = 0; i < 20; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 10 + j; - ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } - - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - int key_id = i * 20 + j * 2; - ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); - } - ASSERT_OK(Flush()); - } + GenerateTestData(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - // verify result - for (int i = 0; i < 200; i++) { - auto result = Get(Key(i)); - if (i % 2) { - ASSERT_EQ(result, "value" + std::to_string(i)); - } else { - ASSERT_EQ(result, "value_new" + std::to_string(i)); - } - } + VerifyTestData(); ASSERT_EQ(my_cs->GetCompactionNum(), 0); diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index 779b980d825..8e72dce9d03 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -14,12 +14,11 @@ #include "rocksdb/listener.h" #include "rocksdb/utilities/debug.h" #include "test_util/mock_time_env.h" +#include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { - -class TieredCompactionTest : public DBTestBase, - public testing::WithParamInterface { +class TieredCompactionTest : public DBTestBase { public: TieredCompactionTest() : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true), @@ -123,14 +122,8 @@ class TieredCompactionTest : public DBTestBase, pl_stats.Clear(); } - // bottommost_temperature is renaming to last_level_temperature, set either - // of them should have the same effect. void SetColdTemperature(Options& options) { - if (GetParam()) { - options.bottommost_temperature = Temperature::kCold; - } else { - options.last_level_temperature = Temperature::kCold; - } + options.last_level_temperature = Temperature::kCold; } private: @@ -172,7 +165,7 @@ class TieredCompactionTest : public DBTestBase, } }; -TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { +TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 100; @@ -334,7 +327,7 @@ TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); } -TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) { +TEST_F(TieredCompactionTest, RangeBasedTieredStorageUniversal) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 100; @@ -508,7 +501,7 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) { 1); } -TEST_P(TieredCompactionTest, LevelColdRangeDelete) { +TEST_F(TieredCompactionTest, LevelColdRangeDelete) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 100; @@ -614,7 +607,7 @@ class SingleKeySstPartitionerFactory : public SstPartitionerFactory { } }; -TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { +TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { const int kNumTrigger = 4; const int kNumLevels = 3; const int kNumKeys = 10; @@ -743,7 +736,7 @@ TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); } -TEST_P(TieredCompactionTest, UniversalRangeDelete) { +TEST_F(TieredCompactionTest, UniversalRangeDelete) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 10; @@ -875,7 +868,7 @@ TEST_P(TieredCompactionTest, UniversalRangeDelete) { ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); } -TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { +TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 100; @@ -1099,7 +1092,7 @@ TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); } -TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { +TEST_F(TieredCompactionTest, RangeBasedTieredStorageLevel) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 100; @@ -1240,10 +1233,7 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { db_->ReleaseSnapshot(temp_snap); } -INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest, - testing::Bool()); - -TEST_P(TieredCompactionTest, CheckInternalKeyRange) { +TEST_F(TieredCompactionTest, CheckInternalKeyRange) { // When compacting keys from the last level to penultimate level, // output to penultimate level should be within internal key range // of input files from penultimate level. @@ -1318,8 +1308,8 @@ TEST_P(TieredCompactionTest, CheckInternalKeyRange) { class PrecludeLastLevelTest : public DBTestBase { public: - PrecludeLastLevelTest() - : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) { + PrecludeLastLevelTest(std::string test_name = "preclude_last_level_test") + : DBTestBase(test_name, /*env_do_fsync=*/false) { mock_clock_ = std::make_shared(env_->GetSystemClock()); mock_clock_->SetCurrentTime(kMockStartTime); mock_env_ = std::make_unique(env_, mock_clock_); @@ -1338,7 +1328,7 @@ class PrecludeLastLevelTest : public DBTestBase { SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { auto periodic_task_scheduler_ptr = - reinterpret_cast(arg); + static_cast(arg); periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); }); mock_clock_->SetCurrentTime(kMockStartTime); @@ -1575,9 +1565,8 @@ TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) { ASSERT_EQ(tables_props.size(), 1); ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty()); SeqnoToTimeMapping tp_mapping; - ASSERT_OK( - tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom( + tables_props.begin()->second->seqno_to_time_mapping)); ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); ASSERT_FALSE(seqs.empty()); @@ -1595,6 +1584,62 @@ TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) { Close(); } +TEST_F(PrecludeLastLevelTest, FastTrackTimedPutToLastLevel) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preclude_last_level_data_seconds = 60; + options.preserve_internal_time_seconds = 0; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.last_level_temperature = Temperature::kCold; + DestroyAndReopen(options); + + Random rnd(301); + + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(rnd.Uniform(10) + 1)); + }); + + for (int i = 0; i < kNumKeys / 2; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(rnd.Uniform(2))); + }); + } + // Create one file with regular Put. + ASSERT_OK(Flush()); + + // Create one file with TimedPut. + // With above mock clock operations, write_unix_time 50 should be before + // current_time - preclude_last_level_seconds. + // These data are eligible to be put on the last level once written to db + // and compaction will fast track them to the last level. + for (int i = kNumKeys / 2; i < kNumKeys; i++) { + ASSERT_OK(TimedPut(0, Key(i), rnd.RandomString(100), 50)); + } + ASSERT_OK(Flush()); + + // TimedPut file moved to the last level immediately. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + + // Wait more than preclude_last_level time, Put file eventually moved to the + // last level. + mock_clock_->MockSleepForSeconds(100); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + Close(); +} + TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { const int kNumTrigger = 4; const int kNumLevels = 7; @@ -1824,7 +1869,6 @@ TEST_P(PrecludeLastLevelTestWithParms, PeriodicCompactionToPenultimateLevel) { options.env = mock_env_.get(); options.level0_file_num_compaction_trigger = kNumTrigger; options.num_levels = kNumLevels; - options.ignore_max_compaction_bytes_for_input = false; options.periodic_compaction_seconds = 10000; DestroyAndReopen(options); @@ -2213,6 +2257,309 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { Close(); } +// Tests DBIter::GetProperty("rocksdb.iterator.write-time") return a data's +// approximate write unix time. +// Test Param: +// 1) use tailing iterator or regular iterator (when it applies) +class IteratorWriteTimeTest : public PrecludeLastLevelTest, + public testing::WithParamInterface { + public: + IteratorWriteTimeTest() : PrecludeLastLevelTest("iterator_write_time_test") {} + + uint64_t VerifyKeyAndGetWriteTime(Iterator* iter, + const std::string& expected_key) { + std::string prop; + uint64_t write_time = 0; + EXPECT_TRUE(iter->Valid()); + EXPECT_EQ(expected_key, iter->key()); + EXPECT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop)); + Slice prop_slice = prop; + EXPECT_TRUE(GetFixed64(&prop_slice, &write_time)); + return write_time; + } + + void VerifyKeyAndWriteTime(Iterator* iter, const std::string& expected_key, + uint64_t expected_write_time) { + std::string prop; + uint64_t write_time = 0; + EXPECT_TRUE(iter->Valid()); + EXPECT_EQ(expected_key, iter->key()); + EXPECT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop)); + Slice prop_slice = prop; + EXPECT_TRUE(GetFixed64(&prop_slice, &write_time)); + EXPECT_EQ(expected_write_time, write_time); + } +}; + +TEST_P(IteratorWriteTimeTest, ReadFromMemtables) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kSecondsPerRecording = 101; + const int kKeyWithWriteTime = 25; + const uint64_t kUserSpecifiedWriteTime = + kMockStartTime + kSecondsPerRecording * 15; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.preserve_internal_time_seconds = 10000; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumKeys; i++) { + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); + if (i == kKeyWithWriteTime) { + ASSERT_OK( + TimedPut(Key(i), rnd.RandomString(100), kUserSpecifiedWriteTime)); + } else { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + } + + ReadOptions ropts; + ropts.tailing = GetParam(); + int i; + + // Forward iteration + uint64_t start_time = 0; + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + for (iter->SeekToFirst(), i = 0; iter->Valid(); iter->Next(), i++) { + if (start_time == 0) { + start_time = VerifyKeyAndGetWriteTime(iter.get(), Key(i)); + } else if (i == kKeyWithWriteTime) { + VerifyKeyAndWriteTime(iter.get(), Key(i), kUserSpecifiedWriteTime); + } else { + VerifyKeyAndWriteTime(iter.get(), Key(i), + start_time + kSecondsPerRecording * (i + 1)); + } + } + ASSERT_OK(iter->status()); + } + + // Backward iteration + { + ropts.tailing = false; + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + for (iter->SeekToLast(), i = kNumKeys - 1; iter->Valid(); + iter->Prev(), i--) { + if (i == 0) { + VerifyKeyAndWriteTime(iter.get(), Key(i), start_time); + } else if (i == kKeyWithWriteTime) { + VerifyKeyAndWriteTime(iter.get(), Key(i), kUserSpecifiedWriteTime); + } else { + VerifyKeyAndWriteTime(iter.get(), Key(i), + start_time + kSecondsPerRecording * (i + 1)); + } + } + ASSERT_OK(iter->status()); + } + + // Reopen the DB and disable the seqno to time recording, data with user + // specified write time can still get a write time before it's flushed. + options.preserve_internal_time_seconds = 0; + DestroyAndReopen(options); + ASSERT_OK(TimedPut(Key(kKeyWithWriteTime), rnd.RandomString(100), + kUserSpecifiedWriteTime)); + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + iter->Seek(Key(kKeyWithWriteTime)); + VerifyKeyAndWriteTime(iter.get(), Key(kKeyWithWriteTime), + kUserSpecifiedWriteTime); + ASSERT_OK(iter->status()); + } + + ASSERT_OK(Flush()); + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + iter->Seek(Key(kKeyWithWriteTime)); + VerifyKeyAndWriteTime(iter.get(), Key(kKeyWithWriteTime), + std::numeric_limits::max()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(IteratorWriteTimeTest, ReadFromSstFile) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kSecondsPerRecording = 101; + const int kKeyWithWriteTime = 25; + const uint64_t kUserSpecifiedWriteTime = + kMockStartTime + kSecondsPerRecording * 15; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.preserve_internal_time_seconds = 10000; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumKeys; i++) { + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); + if (i == kKeyWithWriteTime) { + ASSERT_OK( + TimedPut(Key(i), rnd.RandomString(100), kUserSpecifiedWriteTime)); + } else { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + } + + ASSERT_OK(Flush()); + ReadOptions ropts; + ropts.tailing = GetParam(); + std::string prop; + int i; + + // Forward iteration + uint64_t start_time = 0; + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + for (iter->SeekToFirst(), i = 0; iter->Valid(); iter->Next(), i++) { + if (start_time == 0) { + start_time = VerifyKeyAndGetWriteTime(iter.get(), Key(i)); + } else if (i == kKeyWithWriteTime) { + // It's not precisely kUserSpecifiedWriteTime, instead it has a margin + // of error that is one recording apart while we convert write time to + // sequence number, and then back to write time. + VerifyKeyAndWriteTime(iter.get(), Key(i), + kUserSpecifiedWriteTime - kSecondsPerRecording); + } else { + VerifyKeyAndWriteTime(iter.get(), Key(i), + start_time + kSecondsPerRecording * (i + 1)); + } + } + ASSERT_OK(iter->status()); + } + + // Backward iteration + { + ropts.tailing = false; + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + for (iter->SeekToLast(), i = kNumKeys - 1; iter->Valid(); + iter->Prev(), i--) { + if (i == 0) { + VerifyKeyAndWriteTime(iter.get(), Key(i), start_time); + } else if (i == kKeyWithWriteTime) { + VerifyKeyAndWriteTime(iter.get(), Key(i), + kUserSpecifiedWriteTime - kSecondsPerRecording); + } else { + VerifyKeyAndWriteTime(iter.get(), Key(i), + start_time + kSecondsPerRecording * (i + 1)); + } + } + ASSERT_OK(iter->status()); + } + + // Reopen the DB and disable the seqno to time recording. Data retrieved from + // SST files still have write time available. + options.preserve_internal_time_seconds = 0; + DestroyAndReopen(options); + + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); + ASSERT_OK(Put("a", "val")); + ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty()); + + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + // "a" is retrieved from memtable, its write time is unknown because the + // seqno to time mapping recording is not available. + VerifyKeyAndWriteTime(iter.get(), "a", + std::numeric_limits::max()); + for (iter->Next(), i = 0; iter->Valid(); iter->Next(), i++) { + if (i == 0) { + VerifyKeyAndWriteTime(iter.get(), Key(i), start_time); + } else if (i == kKeyWithWriteTime) { + VerifyKeyAndWriteTime(iter.get(), Key(i), + kUserSpecifiedWriteTime - kSecondsPerRecording); + } else { + VerifyKeyAndWriteTime(iter.get(), Key(i), + start_time + kSecondsPerRecording * (i + 1)); + } + } + ASSERT_OK(iter->status()); + } + + // There is no write time info for "a" after it's flushed to SST file either. + ASSERT_OK(Flush()); + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + VerifyKeyAndWriteTime(iter.get(), "a", + std::numeric_limits::max()); + } + + // Sequence number zeroed out after compacted to the last level, write time + // all becomes zero. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + iter->SeekToFirst(); + for (iter->Next(), i = 0; iter->Valid(); iter->Next(), i++) { + VerifyKeyAndWriteTime(iter.get(), Key(i), 0); + } + ASSERT_OK(iter->status()); + } + Close(); +} + +TEST_P(IteratorWriteTimeTest, MergeReturnsBaseValueWriteTime) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kSecondsPerRecording = 101; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.preserve_internal_time_seconds = 10000; + options.num_levels = kNumLevels; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); + ASSERT_OK(Put("foo", "fv1")); + + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); + ASSERT_OK(Put("bar", "bv1")); + ASSERT_OK(Merge("foo", "bv1")); + + ReadOptions ropts; + ropts.tailing = GetParam(); + { + std::unique_ptr iter(dbfull()->NewIterator(ropts)); + iter->SeekToFirst(); + uint64_t bar_time = VerifyKeyAndGetWriteTime(iter.get(), "bar"); + iter->Next(); + uint64_t foo_time = VerifyKeyAndGetWriteTime(iter.get(), "foo"); + // "foo" has an older write time because its base value's write time is used + ASSERT_GT(bar_time, foo_time); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +INSTANTIATE_TEST_CASE_P(IteratorWriteTimeTest, IteratorWriteTimeTest, + testing::Bool()); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 0bf79bef191..f9c0f47ef7b 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -170,7 +170,7 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, class DoubleComparator : public Comparator { public: - DoubleComparator() {} + DoubleComparator() = default; const char* Name() const override { return "DoubleComparator"; } @@ -198,7 +198,7 @@ class DoubleComparator : public Comparator { class HashComparator : public Comparator { public: - HashComparator() {} + HashComparator() = default; const char* Name() const override { return "HashComparator"; } @@ -221,7 +221,7 @@ class HashComparator : public Comparator { class TwoStrComparator : public Comparator { public: - TwoStrComparator() {} + TwoStrComparator() = default; const char* Name() const override { return "TwoStrComparator"; } @@ -372,7 +372,7 @@ TEST_P(ComparatorDBTest, Uint64Comparator) { uint64_t r = rnd64.Next(); std::string str; str.resize(8); - memcpy(&str[0], static_cast(&r), 8); + memcpy(str.data(), static_cast(&r), 8); source_strings.push_back(str); } diff --git a/db/convenience.cc b/db/convenience.cc index 08bddc8e8f6..9e78adc74e4 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -34,7 +34,7 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; return VerifySstFileChecksum(options, env_options, read_options, file_path); } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index d1cb022588f..ab8167409fe 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -53,10 +53,9 @@ class ErrorFS : public FileSystemWrapper { num_writable_file_errors_(0) {} const char* Name() const override { return "ErrorEnv"; } - virtual IOStatus NewWritableFile(const std::string& fname, - const FileOptions& opts, - std::unique_ptr* result, - IODebugContext* dbg) override { + IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (writable_file_error_) { ++num_writable_file_errors_; @@ -823,7 +822,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { Options options; options.level_compaction_dynamic_level_bytes = false; options.env = env_.get(); - options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; Status s; @@ -854,7 +852,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { options.env = env_.get(); options.paranoid_file_checks = true; options.create_if_missing = true; - options.check_flush_compaction_key_order = false; Status s; for (const auto& mode : corruption_modes) { delete db_; @@ -886,7 +883,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { Options options; options.level_compaction_dynamic_level_bytes = false; options.env = env_.get(); - options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; for (bool do_flush : {true, false}) { @@ -923,7 +919,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { Options options; options.level_compaction_dynamic_level_bytes = false; options.env = env_.get(); - options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; for (bool do_flush : {true, false}) { @@ -963,7 +958,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { Options options; options.level_compaction_dynamic_level_bytes = false; options.env = env_.get(); - options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; for (bool do_flush : {true, false}) { @@ -1032,7 +1026,6 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { options.env = env_.get(); options.paranoid_file_checks = false; options.create_if_missing = true; - options.check_flush_compaction_key_order = false; delete db_; db_ = nullptr; ASSERT_OK(DestroyDB(dbname_, options)); @@ -1047,7 +1040,6 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { ASSERT_OK(dbi->TEST_FlushMemTable()); mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone); - ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_NOK( @@ -1060,7 +1052,6 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) { options.env = env_.get(); options.paranoid_file_checks = false; options.create_if_missing = true; - ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); @@ -1085,28 +1076,6 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(CorruptionTest, DisableKeyOrderCheck) { - ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}})); - DBImpl* dbi = static_cast_with_check(db_); - - SyncPoint::GetInstance()->SetCallBack( - "OutputValidator::Add:order_check", - [&](void* /*arg*/) { ASSERT_TRUE(false); }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); - ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); - ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); - ASSERT_OK(dbi->TEST_FlushMemTable()); - CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - ASSERT_OK( - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -} - TEST_F(CorruptionTest, VerifyWholeTableChecksum) { CloseDb(); Options options; @@ -1133,7 +1102,7 @@ TEST_F(CorruptionTest, VerifyWholeTableChecksum) { int count{0}; SyncPoint::GetInstance()->SetCallBack( "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { - auto* s = reinterpret_cast(arg); + auto* s = static_cast(arg); ASSERT_NE(s, nullptr); ++count; ASSERT_NOK(*s); @@ -1278,7 +1247,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) { - auto* tmp_s = reinterpret_cast(arg); + auto* tmp_s = static_cast(arg); assert(tmp_s); *tmp_s = Status::IOError("Injected"); }); @@ -1460,7 +1429,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::Open::BeforeSyncWAL", [&](void* arg) { - auto* tmp_s = reinterpret_cast(arg); + auto* tmp_s = static_cast(arg); assert(tmp_s); *tmp_s = Status::IOError("Injected"); }); @@ -1628,7 +1597,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) { - auto* tmp_s = reinterpret_cast(arg); + auto* tmp_s = static_cast(arg); assert(tmp_s); *tmp_s = Status::IOError("Injected"); }); diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index dec5c05a335..84cd5f88380 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -131,7 +131,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); @@ -148,7 +148,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Put("key6", "v6")); ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); @@ -166,7 +166,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Delete("key5")); ASSERT_OK(Delete("key4")); ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); @@ -191,7 +191,7 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) { ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); @@ -209,7 +209,7 @@ static std::string Key(int i) { static std::string Uint64Key(uint64_t i) { std::string str; str.resize(8); - memcpy(&str[0], static_cast(&i), 8); + memcpy(str.data(), static_cast(&i), 8); return str; } } // namespace. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index ba260947380..2a4e3d411c2 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -25,10 +25,60 @@ #include "util/random.h" #include "utilities/counted_fs.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" namespace ROCKSDB_NAMESPACE { +namespace { +class MyFlushBlockPolicy : public FlushBlockPolicy { + public: + explicit MyFlushBlockPolicy(const int num_keys_in_block, + const BlockBuilder& data_block_builder) + : num_keys_in_block_(num_keys_in_block), + num_keys_(0), + data_block_builder_(data_block_builder) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (data_block_builder_.empty()) { + // First key in this block + num_keys_ = 1; + return false; + } + // Flush every 10 keys + if (num_keys_ == num_keys_in_block_) { + num_keys_ = 1; + return true; + } + num_keys_++; + return false; + } + + private: + const int num_keys_in_block_; + int num_keys_; + const BlockBuilder& data_block_builder_; +}; + +class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + explicit MyFlushBlockPolicyFactory(const int num_keys_in_block) + : num_keys_in_block_(num_keys_in_block) {} + + virtual const char* Name() const override { + return "MyFlushBlockPolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& data_block_builder) const override { + return new MyFlushBlockPolicy(num_keys_in_block_, data_block_builder); + } + + private: + const int num_keys_in_block_; +}; +} // namespace static bool enable_io_uring = true; extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; } @@ -392,6 +442,41 @@ TEST_F(DBBasicTest, PutSingleDeleteGet) { kSkipMergePut)); } +TEST_F(DBBasicTest, TimedPutBasic) { + do { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(TimedPut(1, "foo", "v1", /*write_unix_time=*/0)); + // Read from memtable + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(TimedPut(1, "foo", "v2.1", /*write_unix_time=*/3)); + ASSERT_EQ("v2.1", Get(1, "foo")); + + // Read from sst file + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); + ASSERT_OK(Merge(1, "foo", "v2.2")); + ASSERT_EQ("v2.1,v2.2", Get(1, "foo")); + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + + ASSERT_OK(TimedPut(1, "bar", "bv1", /*write_unix_time=*/0)); + ASSERT_EQ("bv1", Get(1, "bar")); + ASSERT_OK(TimedPut(1, "baz", "bzv1", /*write_unix_time=*/0)); + ASSERT_EQ("bzv1", Get(1, "baz")); + std::string range_del_begin = "b"; + std::string range_del_end = "baz"; + Slice begin_rdel = range_del_begin, end_rdel = range_del_end; + ASSERT_OK( + db_->DeleteRange(WriteOptions(), handles_[1], begin_rdel, end_rdel)); + ASSERT_EQ("NOT_FOUND", Get(1, "bar")); + + ASSERT_EQ("bzv1", Get(1, "baz")); + ASSERT_OK(SingleDelete(1, "baz")); + ASSERT_EQ("NOT_FOUND", Get(1, "baz")); + } while (ChangeOptions(kSkipPlainTable)); +} + TEST_F(DBBasicTest, EmptyFlush) { // It is possible to produce empty flushes when using single deletes. Tests // whether empty flushes cause issues. @@ -1318,9 +1403,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) { for (int i = 0; i < num_keys; ++i) { int cf = i / 3; int cf_key = 1 % 3; - cf_kv_vec.emplace_back(std::make_tuple( + cf_kv_vec.emplace_back( cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key), - "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key))); + "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)); ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), std::get<2>(cf_kv_vec[i]))); } @@ -1853,6 +1938,214 @@ TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) { } } +TEST_P(DBMultiGetTestWithParam, MultiGetDuplicatesEmptyLevel) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(env_->GetFileSystem())); + std::unique_ptr env(new CompositeEnvWrapper(env_, fault_fs)); + Options options = CurrentOptions(); + options.env = env.get(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + + LRUCacheOptions cache_opts; + cache_opts.capacity = 1 << 20; + + BlockBasedTableOptions table_opts; + table_opts.metadata_cache_options.top_level_index_pinning = PinningTier::kAll; + table_opts.metadata_cache_options.partition_pinning = PinningTier::kNone; + table_opts.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_opts.cache_index_and_filter_blocks = true; + table_opts.block_cache = cache_opts.MakeSharedCache(); + table_opts.flush_block_policy_factory.reset(new MyFlushBlockPolicyFactory(1)); + + options.table_factory.reset(new BlockBasedTableFactory(table_opts)); + Reopen(options); + int key; + + // Setup the LSM so that the following search bounds are generated for + // key 9 for each level - + // Level 1 - lb = 0, rb = max + // Level 2 - lb = 0, rb = 0 + // Level 3 - lb = 0, rb = -1 + // Level 4 - lb = 0, rb = 0 + + key = 9; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + + key = 5; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + key = 9; + ASSERT_OK( + Merge("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + const Snapshot* snap = dbfull()->GetSnapshot(); + ASSERT_OK( + Merge("key_" + std::to_string(key), "val_l2_ext_" + std::to_string(key))); + ASSERT_OK(Flush()); + // Leave level 3 empty + MoveFilesToLevel(2); + + key = 2; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + key = 6; + ASSERT_OK( + Merge("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + std::vector keys; + std::vector values; + + keys.push_back("key_" + std::to_string(9)); + keys.push_back("key_" + std::to_string(9)); + + int num_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "FaultInjectionTestFS::RandomRead", [&](void*) { + ++num_reads; + // Fail on the 2nd read. First read is index partition, + // second read is data block in level 1 + if (num_reads == 2) { + fault_fs->SetFilesystemActive(false); + } else { + fault_fs->SetFilesystemActive(true); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + size_t capacity = table_opts.block_cache->GetCapacity(); + table_opts.block_cache->SetCapacity(0); + table_opts.block_cache->SetCapacity(capacity); + + values = MultiGet(keys, nullptr, std::get<1>(GetParam())); + ASSERT_EQ(values.size(), 2); + + SyncPoint::GetInstance()->DisableProcessing(); + dbfull()->ReleaseSnapshot(snap); + Destroy(options); +} + +TEST_P(DBMultiGetTestWithParam, MultiGetDuplicatesNonEmptyLevel) { +#ifndef USE_COROUTINES + if (std::get<1>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test requires coroutine support"); + return; + } +#endif // USE_COROUTINES + // Skip for unbatched MultiGet + if (!std::get<0>(GetParam())) { + ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(env_->GetFileSystem())); + std::unique_ptr env(new CompositeEnvWrapper(env_, fault_fs)); + Options options = CurrentOptions(); + options.env = env.get(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + + LRUCacheOptions cache_opts; + cache_opts.capacity = 1 << 20; + + BlockBasedTableOptions table_opts; + table_opts.metadata_cache_options.top_level_index_pinning = PinningTier::kAll; + table_opts.metadata_cache_options.partition_pinning = PinningTier::kNone; + table_opts.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_opts.cache_index_and_filter_blocks = true; + table_opts.block_cache = cache_opts.MakeSharedCache(); + table_opts.flush_block_policy_factory.reset(new MyFlushBlockPolicyFactory(1)); + + options.table_factory.reset(new BlockBasedTableFactory(table_opts)); + Reopen(options); + int key; + + // Setup the LSM so that the following search bounds are generated for + // key 9 for each level - + // Level 1 - lb = 0, rb = max + // Level 2 - lb = 0, rb = 0 + // Level 3 - lb = 0, rb = 1 + // Level 4 - N/A + + key = 8; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + + key = 7; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + + key = 9; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + + key = 5; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + key = 9; + ASSERT_OK( + Merge("key_" + std::to_string(key), "merge1_l2_" + std::to_string(key))); + const Snapshot* snap = dbfull()->GetSnapshot(); + ASSERT_OK( + Merge("key_" + std::to_string(key), "merge2_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + key = 2; + ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + key = 6; + ASSERT_OK( + Merge("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + std::vector keys; + std::vector values; + + keys.push_back("key_" + std::to_string(9)); + keys.push_back("key_" + std::to_string(9)); + + int num_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "FaultInjectionTestFS::RandomRead", [&](void*) { + ++num_reads; + // Fail on the 2nd read. First read is index partition, + // second read is data block in level 1 + if (num_reads == 2) { + fault_fs->SetFilesystemActive(false); + } else { + fault_fs->SetFilesystemActive(true); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + size_t capacity = table_opts.block_cache->GetCapacity(); + table_opts.block_cache->SetCapacity(0); + table_opts.block_cache->SetCapacity(capacity); + + values = MultiGet(keys, nullptr, std::get<1>(GetParam())); + ASSERT_EQ(values.size(), 2); + ASSERT_EQ(values[0], "Corruption: Not active"); + ASSERT_EQ(values[1], "val_l2_9,merge1_l2_9,merge2_l2_9"); + + SyncPoint::GetInstance()->DisableProcessing(); + dbfull()->ReleaseSnapshot(snap); + Destroy(options); +} + TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { @@ -2349,9 +2642,9 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { key_strs.push_back(Key(33)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); - keys.push_back(key_strs[2]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); + keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2396,9 +2689,9 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) { key_strs.push_back(Key(33)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); - keys.push_back(key_strs[2]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); + keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2461,9 +2754,9 @@ TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) { key_strs.push_back(Key(21)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); - keys.push_back(key_strs[2]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); + keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2506,9 +2799,9 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) { key_strs.push_back(Key(33)); key_strs.push_back(Key(56)); key_strs.push_back(Key(102)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); - keys.push_back(key_strs[2]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); + keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2549,8 +2842,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) { // 19 and 26 are in L2, but overlap with L0 and L1 file ranges key_strs.push_back(Key(19)); key_strs.push_back(Key(26)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2585,8 +2878,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) { // 139 and 163 are in L2, but overlap with a range deletes in L1 key_strs.push_back(Key(139)); key_strs.push_back(Key(163)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2615,9 +2908,9 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) { key_strs.push_back(Key(139)); key_strs.push_back(Key(144)); key_strs.push_back(Key(163)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); - keys.push_back(key_strs[2]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); + keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -2648,9 +2941,9 @@ TEST_P(DBMultiGetAsyncIOTest, GetNoIOUring) { key_strs.push_back(Key(33)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); - keys.push_back(key_strs[0]); - keys.push_back(key_strs[1]); - keys.push_back(key_strs[2]); + keys.emplace_back(key_strs[0]); + keys.emplace_back(key_strs[1]); + keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); @@ -3029,9 +3322,9 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { // Warm up the cache first key_data.emplace_back(Key(0)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), @@ -3086,8 +3379,7 @@ TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { ASSERT_NE(nullptr, arg); - *(reinterpret_cast(arg)) = - Status::Corruption("Inject corruption"); + *(static_cast(arg)) = Status::Corruption("Inject corruption"); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -3128,7 +3420,8 @@ TEST_F(DBBasicTest, LastSstFileNotInManifest) { // Manually add a sst file. constexpr uint64_t kSstFileNumber = 100; const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); - ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + ASSERT_OK(WriteStringToFile(env_, + /* data = */ "bad sst file content", /* fname = */ kSstFile, /* should_sync = */ true)); ASSERT_OK(env_->FileExists(kSstFile)); @@ -3405,10 +3698,10 @@ TEST_F(DBBasicTest, ConcurrentlyCloseDB) { DestroyAndReopen(options); std::vector workers; for (int i = 0; i < 10; i++) { - workers.push_back(std::thread([&]() { + workers.emplace_back([&]() { auto s = db_->Close(); ASSERT_OK(s); - })); + }); } for (auto& w : workers) { w.join(); @@ -3517,7 +3810,7 @@ class DBBasicTestMultiGet : public DBTestBase { table_options.pin_l0_filter_and_index_blocks_in_cache = true; } table_options.flush_block_policy_factory.reset( - new MyFlushBlockPolicyFactory()); + new MyFlushBlockPolicyFactory(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); if (!compression_enabled_) { options.compression = kNoCompression; @@ -3605,46 +3898,6 @@ class DBBasicTestMultiGet : public DBTestBase { static void TearDownTestCase() {} protected: - class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { - public: - MyFlushBlockPolicyFactory() {} - - virtual const char* Name() const override { - return "MyFlushBlockPolicyFactory"; - } - - virtual FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& /*table_options*/, - const BlockBuilder& data_block_builder) const override { - return new MyFlushBlockPolicy(data_block_builder); - } - }; - - class MyFlushBlockPolicy : public FlushBlockPolicy { - public: - explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder) - : num_keys_(0), data_block_builder_(data_block_builder) {} - - bool Update(const Slice& /*key*/, const Slice& /*value*/) override { - if (data_block_builder_.empty()) { - // First key in this block - num_keys_ = 1; - return false; - } - // Flush every 10 keys - if (num_keys_ == 10) { - num_keys_ = 1; - return true; - } - num_keys_++; - return false; - } - - private: - int num_keys_; - const BlockBuilder& data_block_builder_; - }; - class MyBlockCache : public CacheWrapper { public: explicit MyBlockCache(std::shared_ptr target) @@ -3722,9 +3975,9 @@ TEST_P(DBBasicTestWithParallelIO, MultiGet) { // Warm up the cache first key_data.emplace_back(Key(0)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), @@ -3903,9 +4156,9 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { // Warm up the cache first key_data.emplace_back(Key(0)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), @@ -3973,9 +4226,9 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { // Warm up the cache first key_data.emplace_back(Key(0)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), @@ -4021,9 +4274,9 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) { // Warm up the cache first key_data.emplace_back(Key(0)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); - keys.emplace_back(Slice(key_data.back())); + keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), @@ -4061,7 +4314,7 @@ class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper { const IOOptions& options, IODebugContext* dbg) override; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; @@ -4207,7 +4460,7 @@ IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len, IOStatus DeadlineRandomAccessFile::ReadAsync( FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) { const std::chrono::microseconds deadline = fs_.GetDeadline(); const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); @@ -4288,68 +4541,35 @@ TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { SetTimeElapseOnlySleepOnReopen(&options); ReopenWithColumnFamilies(GetCFNames(), options); - // Test the non-batched version of MultiGet with multiple column - // families + // Test batched MultiGet with an IO delay in the first data block read. + // Both keys in the first CF should succeed as they're in the same data + // block and would form one batch, and we check for deadline between + // batches. std::vector key_str; size_t i; - for (i = 0; i < 5; ++i) { + for (i = 0; i < 10; ++i) { key_str.emplace_back(Key(static_cast(i))); } std::vector cfs(key_str.size()); - ; std::vector keys(key_str.size()); - std::vector values(key_str.size()); + std::vector pin_values(keys.size()); + for (i = 0; i < key_str.size(); ++i) { - cfs[i] = handles_[i]; + // 2 keys per CF + cfs[i] = handles_[i / 2]; keys[i] = Slice(key_str[i].data(), key_str[i].size()); } - ReadOptions ro; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.async_io = GetParam(); // Delay the first IO fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); - std::vector statuses = dbfull()->MultiGet(ro, cfs, keys, &values); - // The first key is successful because we check after the lookup, but - // subsequent keys fail due to deadline exceeded - CheckStatus(statuses, 1); - - // Clear the cache - cache->SetCapacity(0); - cache->SetCapacity(1048576); - // Test non-batched Multiget with multiple column families and - // introducing an IO delay in one of the middle CFs - key_str.clear(); - for (i = 0; i < 10; ++i) { - key_str.emplace_back(Key(static_cast(i))); - } - cfs.resize(key_str.size()); - keys.resize(key_str.size()); - values.resize(key_str.size()); - for (i = 0; i < key_str.size(); ++i) { - // 2 keys per CF - cfs[i] = handles_[i / 2]; - keys[i] = Slice(key_str[i].data(), key_str[i].size()); - } - ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; - fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1); - statuses = dbfull()->MultiGet(ro, cfs, keys, &values); - CheckStatus(statuses, 3); - - // Test batched MultiGet with an IO delay in the first data block read. - // Both keys in the first CF should succeed as they're in the same data - // block and would form one batch, and we check for deadline between - // batches. - std::vector pin_values(keys.size()); - cache->SetCapacity(0); - cache->SetCapacity(1048576); - statuses.clear(); - statuses.resize(keys.size()); - ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; - fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); + std::vector statuses(key_str.size()); dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), pin_values.data(), statuses.data()); + // The first key is successful because we check after the lookup, but + // subsequent keys fail due to deadline exceeded CheckStatus(statuses, 2); // Similar to the previous one, but an IO delay in the third CF data block @@ -4427,7 +4647,7 @@ TEST_F(DBBasicTest, ManifestWriteFailure) { SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { ASSERT_NE(nullptr, arg); - auto* s = reinterpret_cast(arg); + auto* s = static_cast(arg); ASSERT_OK(*s); // Manually overwrite return status *s = Status::IOError(); @@ -4482,7 +4702,7 @@ TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) { - auto* s = reinterpret_cast(arg); + auto* s = static_cast(arg); assert(s); *s = Status::IOError("Injected"); }); @@ -4560,7 +4780,7 @@ TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) { uint64_t number; FileType type; ASSERT_OK(env_->GetChildren(dbname_, &filenames)); - for (auto name : filenames) { + for (const auto& name : filenames) { if (ParseFileName(name, &number, &type)) { if (type == kTableFile) { sst_cnt++; diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 4acdc64b222..e40124bbee8 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -744,7 +744,7 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) { const size_t capacity = size_t{1} << 25; const int num_shard_bits = 0; // 1 shard int iterations_tested = 0; - for (std::shared_ptr base_cache : + for (const std::shared_ptr& base_cache : {NewLRUCache(capacity, num_shard_bits), // FixedHyperClockCache HyperClockCacheOptions( @@ -990,7 +990,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { int iterations_tested = 0; for (bool partition : {false, true}) { SCOPED_TRACE("Partition? " + std::to_string(partition)); - for (std::shared_ptr cache : + for (const std::shared_ptr& cache : {NewLRUCache(capacity), HyperClockCacheOptions( capacity, @@ -1251,7 +1251,7 @@ void DummyFillCache(Cache& cache, size_t entry_size, class CountingLogger : public Logger { public: - ~CountingLogger() override {} + ~CountingLogger() override = default; using Logger::Logv; void Logv(const InfoLogLevel log_level, const char* format, va_list /*ap*/) override { @@ -1372,7 +1372,7 @@ class StableCacheKeyTestFS : public FaultInjectionTestFS { SetFailGetUniqueId(true); } - virtual ~StableCacheKeyTestFS() override {} + ~StableCacheKeyTestFS() override = default; IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&, IODebugContext*) override { @@ -1423,7 +1423,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", [&](void* arg) { - TableProperties* props = reinterpret_cast(arg); + TableProperties* props = static_cast(arg); props->orig_file_number = 0; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -1565,7 +1565,7 @@ class CacheKeyTest : public testing::Test { tp_.db_id = std::to_string(db_id_); tp_.orig_file_number = file_number; bool is_stable; - std::string cur_session_id = ""; // ignored + std::string cur_session_id; // ignored uint64_t cur_file_number = 42; // ignored OffsetableCacheKey rv; BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number, @@ -1833,6 +1833,7 @@ class DBBlockCachePinningTest PinningTier unpartitioned_pinning_; }; +#ifdef LZ4 TEST_P(DBBlockCachePinningTest, TwoLevelDB) { // Creates one file in L0 and one file in L1. Both files have enough data that // their index and filter blocks are partitioned. The L1 file will also have @@ -1844,10 +1845,7 @@ TEST_P(DBBlockCachePinningTest, TwoLevelDB) { const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize; Options options = CurrentOptions(); - // `kNoCompression` makes the unit test more portable. But it relies on the - // current behavior of persisting/accessing dictionary even when there's no - // (de)compression happening, which seems fairly likely to change over time. - options.compression = kNoCompression; + options.compression = kLZ4Compression; options.compression_opts.max_dict_bytes = 4 << 10; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; @@ -1960,6 +1958,7 @@ TEST_P(DBBlockCachePinningTest, TwoLevelDB) { ASSERT_EQ(expected_compression_dict_misses, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); } +#endif INSTANTIATE_TEST_CASE_P( DBBlockCachePinningTest, DBBlockCachePinningTest, diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index abe7f247610..8badcd841f6 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -19,6 +19,7 @@ #include "port/stack_trace.h" #include "rocksdb/advanced_options.h" #include "rocksdb/convenience.h" +#include "rocksdb/experimental.h" #include "rocksdb/filter_policy.h" #include "rocksdb/perf_context.h" #include "rocksdb/statistics.h" @@ -77,7 +78,7 @@ class DBBloomFilterTestWithParam DBBloomFilterTestWithParam() : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} - ~DBBloomFilterTestWithParam() override {} + ~DBBloomFilterTestWithParam() override = default; void SetUp() override { bfp_impl_ = std::get<0>(GetParam()); @@ -2050,7 +2051,7 @@ class DBBloomFilterTestVaryPrefixAndFormatVer DBBloomFilterTestVaryPrefixAndFormatVer() : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} - ~DBBloomFilterTestVaryPrefixAndFormatVer() override {} + ~DBBloomFilterTestVaryPrefixAndFormatVer() override = default; void SetUp() override { use_prefix_ = std::get<0>(GetParam()); @@ -2125,8 +2126,9 @@ TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { values[i] = PinnableSlice(); } - db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], - /*timestamps=*/nullptr, &statuses[0], true); + db_->MultiGet(ropts, Q, column_families.data(), key_slices.data(), + values.data(), + /*timestamps=*/nullptr, statuses.data(), true); // Confirm correct status results uint32_t number_not_found = 0; @@ -2176,8 +2178,9 @@ TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { values[i] = PinnableSlice(); } - db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], - /*timestamps=*/nullptr, &statuses[0], true); + db_->MultiGet(ropts, Q, column_families.data(), key_slices.data(), + values.data(), + /*timestamps=*/nullptr, statuses.data(), true); // Confirm correct status results uint32_t number_not_found = 0; @@ -3612,6 +3615,277 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { } } +TEST_F(DBBloomFilterTest, SstQueryFilter) { + using experimental::KeySegmentsExtractor; + using experimental::MakeSharedBytewiseMinMaxSQFC; + using experimental::SelectKeySegment; + using experimental::SstQueryFilterConfigs; + using experimental::SstQueryFilterConfigsManager; + using KeyCategorySet = KeySegmentsExtractor::KeyCategorySet; + + struct MySegmentExtractor : public KeySegmentsExtractor { + char min_first_char; + char max_first_char; + char delim_char; + MySegmentExtractor(char _min_first_char, char _max_first_char, + char _delim_char) + : min_first_char(_min_first_char), + max_first_char(_max_first_char), + delim_char(_delim_char) {} + + const char* Name() const override { return "MySegmentExtractor"; } + + std::string GetId() const override { + return std::string("MySegmentExtractor+") + min_first_char + + max_first_char + delim_char; + } + + void Extract(const Slice& key_or_bound, KeyKind /*kind*/, + Result* result) const override { + size_t len = key_or_bound.size(); + if (len == 0) { + result->category = KeySegmentsExtractor::kReservedLowCategory; + } else if (static_cast(key_or_bound[0]) < + static_cast(min_first_char)) { + result->category = KeySegmentsExtractor::kReservedLowCategory; + } else if (static_cast(key_or_bound[0]) > + static_cast(max_first_char)) { + result->category = KeySegmentsExtractor::kReservedHighCategory; + } + for (uint32_t i = 0; i < len; ++i) { + if (key_or_bound[i] == delim_char || i + 1 == key_or_bound.size()) { + result->segment_ends.push_back(i + 1); + } + } + } + }; + + // Use '_' as delimiter, but different spans for default category + auto extractor_to_c = std::make_shared('a', 'c', '_'); + auto extractor_to_z = std::make_shared('a', 'z', '_'); + auto extractor_alt = std::make_shared('0', '9', '_'); + + // Filter on 2nd field, only for default category + auto filter1_def = MakeSharedBytewiseMinMaxSQFC( + experimental::SelectKeySegment(1), + KeyCategorySet{KeySegmentsExtractor::kDefaultCategory}); + + // Also filter on 3rd field regardless of category + auto filter2_all = + MakeSharedBytewiseMinMaxSQFC(experimental::SelectKeySegment(2)); + + SstQueryFilterConfigs configs1 = {{filter1_def, filter2_all}, extractor_to_c}; + SstQueryFilterConfigs configs2 = {{filter1_def, filter2_all}, extractor_to_z}; + SstQueryFilterConfigs configs3 = {{filter2_all}, extractor_alt}; + + SstQueryFilterConfigsManager::Data data = { + {42, {{"foo", configs1}}}, {43, {{"foo", configs2}, {"bar", configs3}}}}; + + std::shared_ptr configs_manager; + ASSERT_OK(SstQueryFilterConfigsManager::MakeShared(data, &configs_manager)); + + // Test manager behaviors + auto MakeFactory = [configs_manager]( + const std::string& configs_name, + SstQueryFilterConfigsManager::FilteringVersion ver) + -> std::shared_ptr { + std::shared_ptr factory; + Status s = configs_manager->MakeSharedFactory(configs_name, ver, &factory); + assert(s.ok()); + return factory; + }; + std::shared_ptr factory; + + // Version 0 is always OK, returning empty/not found configs + ASSERT_TRUE(MakeFactory("blah", 0)->GetConfigs().IsEmptyNotFound()); + ASSERT_TRUE(MakeFactory("foo", 0)->GetConfigs().IsEmptyNotFound()); + ASSERT_TRUE(MakeFactory("bar", 0)->GetConfigs().IsEmptyNotFound()); + + // We can't be sure about the proper configuration for versions outside the + // known range (and reserved version 0). + ASSERT_TRUE(configs_manager->MakeSharedFactory("foo", 1, &factory) + .IsInvalidArgument()); + ASSERT_TRUE(configs_manager->MakeSharedFactory("foo", 41, &factory) + .IsInvalidArgument()); + ASSERT_TRUE(configs_manager->MakeSharedFactory("foo", 44, &factory) + .IsInvalidArgument()); + ASSERT_TRUE(configs_manager->MakeSharedFactory("foo", 500, &factory) + .IsInvalidArgument()); + + ASSERT_TRUE(MakeFactory("blah", 42)->GetConfigs().IsEmptyNotFound()); + ASSERT_TRUE(MakeFactory("blah", 43)->GetConfigs().IsEmptyNotFound()); + ASSERT_FALSE(MakeFactory("foo", 42)->GetConfigs().IsEmptyNotFound()); + ASSERT_FALSE(MakeFactory("foo", 43)->GetConfigs().IsEmptyNotFound()); + ASSERT_TRUE(MakeFactory("bar", 42)->GetConfigs().IsEmptyNotFound()); + ASSERT_FALSE(MakeFactory("bar", 43)->GetConfigs().IsEmptyNotFound()); + + ASSERT_OK(configs_manager->MakeSharedFactory("foo", 42, &factory)); + ASSERT_EQ(factory->GetConfigsName(), "foo"); + ASSERT_EQ(factory->GetConfigs().IsEmptyNotFound(), false); + + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.table_properties_collector_factories.push_back(factory); + + DestroyAndReopen(options); + + // For lower level file + ASSERT_OK(Put(" ", "val0")); + ASSERT_OK(Put(" _345_678", "val0")); + ASSERT_OK(Put("aaa", "val0")); + ASSERT_OK(Put("abc_123", "val1")); + ASSERT_OK(Put("abc_13", "val2")); + ASSERT_OK(Put("abc_156_987", "val3")); + ASSERT_OK(Put("bcd_1722", "val4")); + ASSERT_OK(Put("xyz_145", "val5")); + ASSERT_OK(Put("xyz_167", "val6")); + ASSERT_OK(Put("xyz_178", "val7")); + ASSERT_OK(Put("zzz", "val0")); + ASSERT_OK(Put("~~~", "val0")); + ASSERT_OK(Put("~~~_456_789", "val0")); + + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + ASSERT_EQ(factory->GetFilteringVersion(), 42U); + ASSERT_NOK(factory->SetFilteringVersion(41)); + ASSERT_NOK(factory->SetFilteringVersion(44)); + ASSERT_EQ(factory->GetFilteringVersion(), 42U); + ASSERT_OK(factory->SetFilteringVersion(43)); + ASSERT_EQ(factory->GetFilteringVersion(), 43U); + + // For higher level file + ASSERT_OK(Put(" ", "val0")); + ASSERT_OK(Put(" _345_680", "val0")); + ASSERT_OK(Put("aaa", "val9")); + ASSERT_OK(Put("abc_234", "val1")); + ASSERT_OK(Put("abc_245_567", "val2")); + ASSERT_OK(Put("abc_25", "val3")); + ASSERT_OK(Put("xyz_180", "val4")); + ASSERT_OK(Put("xyz_191", "val4")); + ASSERT_OK(Put("xyz_260", "val4")); + ASSERT_OK(Put("zzz", "val9")); + ASSERT_OK(Put("~~~", "val0")); + ASSERT_OK(Put("~~~_456_790", "val0")); + + ASSERT_OK(Flush()); + + using Keys = std::vector; + auto RangeQueryKeys = + [factory, db = db_]( + std::string lb, std::string ub, + std::shared_ptr alt_factory = + nullptr) { + Slice lb_slice = lb; + Slice ub_slice = ub; + + ReadOptions ro; + ro.iterate_lower_bound = &lb_slice; + ro.iterate_upper_bound = &ub_slice; + ro.table_filter = (alt_factory ? alt_factory : factory) + ->GetTableFilterForRangeQuery(lb_slice, ub_slice); + auto it = db->NewIterator(ro); + Keys ret; + for (it->Seek(lb_slice); it->Valid(); it->Next()) { + ret.push_back(it->key().ToString()); + } + EXPECT_OK(it->status()); + delete it; + return ret; + }; + + // Control 1: range is not filtered but min/max filter is checked + // because of common prefix leading up to 2nd segment + // TODO/future: statistics for when filter is checked vs. not applicable + EXPECT_EQ(RangeQueryKeys("abc_150", "abc_249"), + Keys({"abc_156_987", "abc_234", "abc_245_567"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); + + // Test 1: range is filtered to just lowest level, fully containing the + // segments in that category + EXPECT_EQ(RangeQueryKeys("abc_100", "abc_179"), + Keys({"abc_123", "abc_13", "abc_156_987"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); + + // Test 2: range is filtered to just lowest level, partial overlap + EXPECT_EQ(RangeQueryKeys("abc_1500_x_y", "abc_16QQ"), Keys({"abc_156_987"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); + + // Test 3: range is filtered to just highest level, fully containing the + // segments in that category but would be overlapping the range for the other + // file if the filter included all categories + EXPECT_EQ(RangeQueryKeys("abc_200", "abc_300"), + Keys({"abc_234", "abc_245_567", "abc_25"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); + + // Test 4: range is filtered to just highest level, partial overlap (etc.) + EXPECT_EQ(RangeQueryKeys("abc_200", "abc_249"), + Keys({"abc_234", "abc_245_567"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); + + // Test 5: range is filtered from both levels, because of category scope + EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); + + // Control 2: range is not filtered because association between 1st and + // 2nd segment is not represented + EXPECT_EQ(RangeQueryKeys("abc_170", "abc_190"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); + + // Control 3: range is not filtered because there's no (bloom) filter on + // 1st segment (like prefix filtering) + EXPECT_EQ(RangeQueryKeys("baa_170", "baa_190"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); + + // Control 4: range is not filtered because difference in segments leading + // up to 2nd segment + EXPECT_EQ(RangeQueryKeys("abc_500", "abd_501"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); + + // TODO: exclusive upper bound tests + + // ======= Testing 3rd segment (cross-category filter) ======= + // Control 5: not filtered because of segment range overlap + EXPECT_EQ(RangeQueryKeys(" z__700", " z__750"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); + + // Test 6: filtered on both levels + EXPECT_EQ(RangeQueryKeys(" z__100", " z__300"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); + + // Control 6: finding something, with 2nd segment filter helping + EXPECT_EQ(RangeQueryKeys("abc_156_9", "abc_156_99"), Keys({"abc_156_987"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); + + EXPECT_EQ(RangeQueryKeys("abc_245_56", "abc_245_57"), Keys({"abc_245_567"})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); + + // Test 6: filtered on both levels, for different segments + EXPECT_EQ(RangeQueryKeys("abc_245_900", "abc_245_999"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); + + // ======= Testing extractor read portability ======= + EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); + + // Only modifies how filters are written + ASSERT_OK(factory->SetFilteringVersion(0)); + ASSERT_EQ(factory->GetFilteringVersion(), 0U); + ASSERT_EQ(factory->GetConfigs().IsEmptyNotFound(), true); + + EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400"), Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); + + // Even a different config name with different extractor can read + EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400", MakeFactory("bar", 43)), + Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); + + // Or a "not found" config name + EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400", MakeFactory("blah", 43)), + Keys({})); + EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 44c406c4965..d84e24c41cb 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -150,7 +150,7 @@ class ConditionalFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - explicit ChangeFilter() {} + explicit ChangeFilter() = default; bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, std::string* new_value, bool* value_changed) const override { @@ -289,7 +289,7 @@ class ConditionalFilterFactory : public CompactionFilterFactory { class ChangeFilterFactory : public CompactionFilterFactory { public: - explicit ChangeFilterFactory() {} + explicit ChangeFilterFactory() = default; std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& /*context*/) override { @@ -342,7 +342,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { { InternalKeyComparator icmp(options.comparator); ReadOptions read_options; - ScopedArenaIterator iter(dbfull()->NewInternalIterator( + ScopedArenaPtr iter(dbfull()->NewInternalIterator( read_options, &arena, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); @@ -434,7 +434,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { { InternalKeyComparator icmp(options.comparator); ReadOptions read_options; - ScopedArenaIterator iter(dbfull()->NewInternalIterator( + ScopedArenaPtr iter(dbfull()->NewInternalIterator( read_options, &arena, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); @@ -717,8 +717,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { Arena arena; InternalKeyComparator icmp(options.comparator); ReadOptions read_options; - ScopedArenaIterator iter(dbfull()->NewInternalIterator(read_options, &arena, - kMaxSequenceNumber)); + ScopedArenaPtr iter(dbfull()->NewInternalIterator( + read_options, &arena, kMaxSequenceNumber)); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 2d71231173b..612a1f21d60 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -41,7 +41,7 @@ class CompactionStatsCollector : public EventListener { } } - ~CompactionStatsCollector() override {} + ~CompactionStatsCollector() override = default; void OnCompactionCompleted(DB* /* db */, const CompactionJobInfo& info) override { @@ -76,7 +76,7 @@ class CompactionStatsCollector : public EventListener { class DBCompactionTest : public DBTestBase { public: DBCompactionTest() - : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {} + : DBTestBase("db_compaction_test", /*env_do_fsync=*/false) {} protected: /* @@ -121,7 +121,7 @@ class DBCompactionTestWithParam public testing::WithParamInterface> { public: DBCompactionTestWithParam() - : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { + : DBTestBase("db_compaction_test", /*env_do_fsync=*/false) { max_subcompactions_ = std::get<0>(GetParam()); exclusive_manual_compaction_ = std::get<1>(GetParam()); } @@ -140,7 +140,7 @@ class DBCompactionTestWithBottommostParam std::tuple> { public: DBCompactionTestWithBottommostParam() - : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { + : DBTestBase("db_compaction_test", /*env_do_fsync=*/false) { bottommost_level_compaction_ = std::get<0>(GetParam()); } @@ -160,7 +160,7 @@ class DBCompactionWaitForCompactTest std::tuple> { public: DBCompactionWaitForCompactTest() - : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { + : DBTestBase("db_compaction_test", /*env_do_fsync=*/false) { abort_on_pause_ = std::get<0>(GetParam()); flush_ = std::get<1>(GetParam()); close_db_ = std::get<2>(GetParam()); @@ -241,8 +241,8 @@ class RoundRobinSubcompactionsAgainstResources namespace { class FlushedFileCollector : public EventListener { public: - FlushedFileCollector() {} - ~FlushedFileCollector() override {} + FlushedFileCollector() = default; + ~FlushedFileCollector() override = default; void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard lock(mutex_); @@ -252,7 +252,7 @@ class FlushedFileCollector : public EventListener { std::vector GetFlushedFiles() { std::lock_guard lock(mutex_); std::vector result; - for (auto fname : flushed_files_) { + for (const auto& fname : flushed_files_) { result.push_back(fname); } return result; @@ -505,7 +505,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", [&](void* arg) { assert(arg != nullptr); - bool no_io = *(reinterpret_cast(arg)); + bool no_io = *(static_cast(arg)); if (!no_io) { // filter out cases for table properties queries. num_table_cache_lookup++; @@ -681,7 +681,7 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { int bottom_pri_count = 0; SyncPoint::GetInstance()->SetCallBack( "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) { - Env::Priority* pri = reinterpret_cast(arg); + Env::Priority* pri = static_cast(arg); // First time is low pri pool in the test case. if (low_pri_count == 0 && bottom_pri_count == 0) { ASSERT_EQ(Env::Priority::LOW, *pri); @@ -845,6 +845,20 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { options.memtable_factory.reset( test::NewSpecialSkipListFactory(kNumKeysPerFile)); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + Random rnd(301); + for (int cf = 0; cf < 4; cf++) { + // Make a trivial L1 for L0 to compact into. L2 will be large so debt ratio + // will not cause compaction pressure. + ASSERT_OK(Put(cf, Key(0), rnd.RandomString(102400))); + ASSERT_OK(Flush(cf)); + MoveFilesToLevel(2, cf); + ASSERT_OK(Put(cf, Key(0), "")); + ASSERT_OK(Flush(cf)); + MoveFilesToLevel(1, cf); + } + // Block all threads in thread pool. const size_t kTotalTasks = 4; env_->SetBackgroundThreads(4, Env::LOW); @@ -855,9 +869,6 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { sleeping_tasks[i].WaitUntilSleeping(); } - CreateAndReopenWithCF({"one", "two", "three"}, options); - - Random rnd(301); for (int cf = 0; cf < 4; cf++) { for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { for (int i = 0; i < kNumKeysPerFile; i++) { @@ -1859,13 +1870,60 @@ TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) { ASSERT_EQ(Get("foo"), "v2"); } -TEST_F(DBCompactionTest, DeleteFileRange) { +// Test params: +// 1) whether to enable user-defined timestamps. +class DBDeleteFileRangeTest : public DBTestBase, + public testing::WithParamInterface { + public: + DBDeleteFileRangeTest() + : DBTestBase("db_delete_file_range_test", /*env_do_fsync=*/true) {} + + void SetUp() override { enable_udt_ = GetParam(); } + + protected: + void PutKeyValue(const Slice& key, const Slice& value) { + if (enable_udt_) { + EXPECT_OK(db_->Put(WriteOptions(), key, min_ts_, value)); + } else { + EXPECT_OK(Put(key, value)); + } + } + + std::string GetValue(const std::string& key) { + ReadOptions roptions; + std::string result; + if (enable_udt_) { + roptions.timestamp = &min_ts_; + } + Status s = db_->Get(roptions, key, &result); + EXPECT_TRUE(s.ok()); + return result; + } + + Status MaybeGetValue(const std::string& key, std::string* result) { + ReadOptions roptions; + if (enable_udt_) { + roptions.timestamp = &min_ts_; + } + Status s = db_->Get(roptions, key, result); + EXPECT_TRUE(s.IsNotFound() || s.ok()); + return s; + } + + bool enable_udt_ = false; + Slice min_ts_ = MinU64Ts(); +}; + +TEST_P(DBDeleteFileRangeTest, DeleteFileRange) { Options options = CurrentOptions(); options.write_buffer_size = 10 * 1024 * 1024; options.max_bytes_for_level_multiplier = 2; options.num_levels = 4; options.level0_file_num_compaction_trigger = 3; options.max_background_compactions = 3; + if (enable_udt_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } DestroyAndReopen(options); int32_t value_size = 10 * 1024; // 10 KB @@ -1877,14 +1935,14 @@ TEST_F(DBCompactionTest, DeleteFileRange) { // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { values[i] = rnd.RandomString(value_size); - ASSERT_OK(Put(Key(i), values[i])); + PutKeyValue(Key(i), values[i]); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { values[i] = rnd.RandomString(value_size); - ASSERT_OK(Put(Key(i), values[i])); + PutKeyValue(Key(i), values[i]); } ASSERT_OK(Flush()); @@ -1900,7 +1958,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { values[i] = rnd.RandomString(value_size); - ASSERT_OK(Put(Key(i), values[i])); + PutKeyValue(Key(i), values[i]); } ASSERT_OK(Flush()); @@ -1912,7 +1970,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } values[j] = rnd.RandomString(value_size); - ASSERT_OK(Put(Key(j), values[j])); + PutKeyValue(Key(j), values[j]); } } ASSERT_OK(Flush()); @@ -1937,11 +1995,10 @@ TEST_F(DBCompactionTest, DeleteFileRange) { int32_t deleted_count = 0; for (int32_t i = 0; i < 4300; i++) { if (i < 1000 || i > 2000) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(GetValue(Key(i)), values[i]); } else { - ReadOptions roptions; std::string result; - Status s = db_->Get(roptions, Key(i), &result); + Status s = MaybeGetValue(Key(i), &result); ASSERT_TRUE(s.IsNotFound() || s.ok()); if (s.IsNotFound()) { deleted_count++; @@ -1971,7 +2028,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { for (int32_t i = 0; i < 4300; i++) { ReadOptions roptions; std::string result; - ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound()); + ASSERT_TRUE(MaybeGetValue(Key(i), &result).IsNotFound()); deleted_count2++; } ASSERT_GT(deleted_count2, deleted_count); @@ -1979,13 +2036,16 @@ TEST_F(DBCompactionTest, DeleteFileRange) { ASSERT_GT(old_num_files, new_num_files); } -TEST_F(DBCompactionTest, DeleteFilesInRanges) { +TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) { Options options = CurrentOptions(); options.write_buffer_size = 10 * 1024 * 1024; options.max_bytes_for_level_multiplier = 2; options.num_levels = 4; options.max_background_compactions = 3; options.disable_auto_compactions = true; + if (enable_udt_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } DestroyAndReopen(options); int32_t value_size = 10 * 1024; // 10 KB @@ -1998,7 +2058,7 @@ TEST_F(DBCompactionTest, DeleteFilesInRanges) { for (auto j = 0; j < 100; j++) { auto k = i * 100 + j; values[k] = rnd.RandomString(value_size); - ASSERT_OK(Put(Key(k), values[k])); + PutKeyValue(Key(k), values[k]); } ASSERT_OK(Flush()); } @@ -2013,7 +2073,7 @@ TEST_F(DBCompactionTest, DeleteFilesInRanges) { for (auto i = 0; i < 10; i += 2) { for (auto j = 0; j < 100; j++) { auto k = i * 100 + j; - ASSERT_OK(Put(Key(k), values[k])); + PutKeyValue(Key(k), values[k]); } ASSERT_OK(Flush()); } @@ -2030,22 +2090,21 @@ TEST_F(DBCompactionTest, DeleteFilesInRanges) { Slice begin2(begin_str2), end2(end_str2); Slice begin3(begin_str3), end3(end_str3); std::vector ranges; - ranges.push_back(RangePtr(&begin1, &end1)); - ranges.push_back(RangePtr(&begin2, &end2)); - ranges.push_back(RangePtr(&begin3, &end3)); + ranges.emplace_back(&begin1, &end1); + ranges.emplace_back(&begin2, &end2); + ranges.emplace_back(&begin3, &end3); ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), ranges.data(), ranges.size())); ASSERT_EQ("0,3,7", FilesPerLevel(0)); // Keys [0, 300) should not exist. for (auto i = 0; i < 300; i++) { - ReadOptions ropts; std::string result; - auto s = db_->Get(ropts, Key(i), &result); + auto s = MaybeGetValue(Key(i), &result); ASSERT_TRUE(s.IsNotFound()); } for (auto i = 300; i < 1000; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(GetValue(Key(i)), values[i]); } } @@ -2058,25 +2117,24 @@ TEST_F(DBCompactionTest, DeleteFilesInRanges) { Slice begin2(begin_str2), end2(end_str2); Slice begin3(begin_str3), end3(end_str3); std::vector ranges; - ranges.push_back(RangePtr(&begin1, &end1)); - ranges.push_back(RangePtr(&begin2, &end2)); - ranges.push_back(RangePtr(&begin3, &end3)); + ranges.emplace_back(&begin1, &end1); + ranges.emplace_back(&begin2, &end2); + ranges.emplace_back(&begin3, &end3); ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), ranges.data(), ranges.size(), false)); ASSERT_EQ("0,1,4", FilesPerLevel(0)); // Keys [600, 900) should not exist. for (auto i = 600; i < 900; i++) { - ReadOptions ropts; std::string result; - auto s = db_->Get(ropts, Key(i), &result); + auto s = MaybeGetValue(Key(i), &result); ASSERT_TRUE(s.IsNotFound()); } for (auto i = 300; i < 600; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(GetValue(Key(i)), values[i]); } for (auto i = 900; i < 1000; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(GetValue(Key(i)), values[i]); } } @@ -2087,15 +2145,14 @@ TEST_F(DBCompactionTest, DeleteFilesInRanges) { ASSERT_EQ("", FilesPerLevel(0)); for (auto i = 0; i < 1000; i++) { - ReadOptions ropts; std::string result; - auto s = db_->Get(ropts, Key(i), &result); + auto s = MaybeGetValue(Key(i), &result); ASSERT_TRUE(s.IsNotFound()); } } } -TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { +TEST_P(DBDeleteFileRangeTest, DeleteFileRangeFileEndpointsOverlapBug) { // regression test for #2833: groups of files whose user-keys overlap at the // endpoints could be split by `DeleteFilesInRange`. This caused old data to // reappear, either because a new version of the key was removed, or a range @@ -2107,6 +2164,9 @@ TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = kNumL0Files; options.target_file_size_base = 1 << 10; // 1KB + if (enable_udt_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } DestroyAndReopen(options); // The snapshot prevents key 1 from having its old version dropped. The low @@ -2130,8 +2190,8 @@ TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { std::string vals[kNumL0Files]; for (int i = 0; i < kNumL0Files; ++i) { vals[i] = rnd.RandomString(kValSize); - ASSERT_OK(Put(Key(i), vals[i])); - ASSERT_OK(Put(Key(i + 1), vals[i])); + PutKeyValue(Key(i), vals[i]); + PutKeyValue(Key(i + 1), vals[i]); ASSERT_OK(Flush()); if (i == 0) { snapshot = db_->GetSnapshot(); @@ -2144,11 +2204,14 @@ TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { std::string begin_str = Key(0), end_str = Key(1); Slice begin = begin_str, end = end_str; ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); - ASSERT_EQ(vals[1], Get(Key(1))); + ASSERT_EQ(vals[1], GetValue(Key(1))); db_->ReleaseSnapshot(snapshot); } +INSTANTIATE_TEST_CASE_P(DBDeleteFileRangeTest, DBDeleteFileRangeTest, + ::testing::Bool()); + TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; @@ -4181,7 +4244,7 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kBottommostFiles); }); @@ -4237,7 +4300,7 @@ TEST_F(DBCompactionTest, DelayCompactBottomLevelFilesWithDeletions) { std::atomic_int compaction_count = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kBottommostFiles); compaction_count++; @@ -4368,7 +4431,7 @@ TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) { SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kTtl) { ttl_compactions++; @@ -4518,7 +4581,7 @@ TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) { SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kTtl) { ttl_compactions++; @@ -4634,7 +4697,7 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { ASSERT_OK(Flush()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -4682,7 +4745,7 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -4709,7 +4772,6 @@ TEST_F(DBCompactionTest, LevelTtlCompactionOutputCuttingIteractingWithOther) { options.env = env_; options.target_file_size_base = 4 << 10; options.disable_auto_compactions = true; - options.level_compaction_dynamic_file_size = false; DestroyAndReopen(options); Random rnd(301); @@ -4810,7 +4872,7 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { int ttl_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kTtl) { ttl_compactions++; @@ -4958,7 +5020,7 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { int periodic_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kPeriodicCompaction) { periodic_compactions++; @@ -5142,7 +5204,7 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { bool set_creation_time_to_zero = true; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kPeriodicCompaction) { periodic_compactions++; @@ -5150,7 +5212,7 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { - TableProperties* props = reinterpret_cast(arg); + TableProperties* props = static_cast(arg); if (set_file_creation_time_to_zero) { props->file_creation_time = 0; } @@ -5214,7 +5276,7 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { int ttl_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kPeriodicCompaction) { periodic_compactions++; @@ -5397,7 +5459,7 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { int periodic_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); auto compaction_reason = compaction->compaction_reason(); if (compaction_reason == CompactionReason::kPeriodicCompaction) { periodic_compactions++; @@ -6150,7 +6212,7 @@ class CompactionPriTest : public DBTestBase, public testing::WithParamInterface { public: CompactionPriTest() - : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) { + : DBTestBase("compaction_pri_test", /*env_do_fsync=*/false) { compaction_pri_ = GetParam(); } @@ -6270,26 +6332,30 @@ TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) { TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) { const int kKeysPerBuffer = 100; + const int kNumSubcompactions = 2; + const int kFilesPerLevel = 50; Options options = CurrentOptions(); - options.num_levels = 4; + options.num_levels = 3; options.max_bytes_for_level_multiplier = 2; options.level0_file_num_compaction_trigger = 4; options.target_file_size_base = kKeysPerBuffer * 1024; options.compaction_pri = CompactionPri::kRoundRobin; - options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024; + // Target size is chosen so that filling the level with `kFilesPerLevel` files + // will make it oversized by `kNumSubcompactions` files. + options.max_bytes_for_level_base = + (kFilesPerLevel - kNumSubcompactions) * kKeysPerBuffer * 1024; options.disable_auto_compactions = true; - // Setup 7 threads but limited subcompactions so that - // RoundRobin requires extra compactions from reserved threads + // Setup `kNumSubcompactions` threads but limited subcompactions so + // that RoundRobin requires extra compactions from reserved threads options.max_subcompactions = 1; - options.max_background_compactions = 7; + options.max_background_compactions = kNumSubcompactions; options.max_compaction_bytes = 100000000; DestroyAndReopen(options); - env_->SetBackgroundThreads(7, Env::LOW); + env_->SetBackgroundThreads(kNumSubcompactions, Env::LOW); Random rnd(301); - const std::vector files_per_level = {0, 15, 25}; for (int lvl = 2; lvl > 0; lvl--) { - for (int i = 0; i < files_per_level[lvl]; i++) { + for (int i = 0; i < kFilesPerLevel; i++) { for (int j = 0; j < kKeysPerBuffer; j++) { // Add (lvl-1) to ensure nearly equivallent number of files // in L2 are overlapped with fils selected to compact from @@ -6300,9 +6366,8 @@ TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) { ASSERT_OK(Flush()); } MoveFilesToLevel(lvl); - ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0)); + ASSERT_EQ(kFilesPerLevel, NumTableFilesAtLevel(lvl, 0)); } - // 15 files in L1; 25 files in L2 // This is a variable for making sure the following callback is called // and the assertions in it are indeed excuted. @@ -6311,10 +6376,10 @@ TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) { "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) { uint64_t num_planned_subcompactions = *(static_cast(arg)); if (grab_pressure_token_) { - // 7 files are selected for round-robin under auto + // `kNumSubcompactions` files are selected for round-robin under auto // compaction. The number of planned subcompaction is restricted by // the limited number of max_background_compactions - ASSERT_EQ(num_planned_subcompactions, 7); + ASSERT_EQ(num_planned_subcompactions, kNumSubcompactions); } else { ASSERT_EQ(num_planned_subcompactions, 1); } @@ -6576,7 +6641,7 @@ TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) { class NoopMergeOperator : public MergeOperator { public: - NoopMergeOperator() {} + NoopMergeOperator() = default; bool FullMergeV2(const MergeOperationInput& /*merge_in*/, MergeOperationOutput* merge_out) const override { @@ -7132,8 +7197,7 @@ TEST_F(DBCompactionTest, ConsistencyFailTest) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistency0", [&](void* arg) { - auto p = - reinterpret_cast*>(arg); + auto p = static_cast*>(arg); // just swap the two FileMetaData so that we hit error // in CheckConsistency funcion FileMetaData* temp = *(p->first); @@ -7170,8 +7234,7 @@ TEST_F(DBCompactionTest, ConsistencyFailTest2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistency1", [&](void* arg) { - auto p = - reinterpret_cast*>(arg); + auto p = static_cast*>(arg); // just swap the two FileMetaData so that we hit error // in CheckConsistency funcion FileMetaData* temp = *(p->first); @@ -7984,7 +8047,7 @@ TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { bool has_compaction = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->max_subcompactions() == 10); has_compaction = true; }); @@ -8008,7 +8071,7 @@ TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->max_subcompactions() == 2); has_compaction = true; }); @@ -8036,7 +8099,7 @@ TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { bool has_compaction = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->max_subcompactions() == 10); has_compaction = true; }); @@ -8059,7 +8122,7 @@ TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(compaction->max_subcompactions() == 2); has_compaction = true; }); @@ -9101,66 +9164,104 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { } TEST_F(DBCompactionTest, FIFOChangeTemperature) { - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleFIFO; - options.num_levels = 1; - options.max_open_files = -1; - options.level0_file_num_compaction_trigger = 2; - options.create_if_missing = true; - CompactionOptionsFIFO fifo_options; - fifo_options.file_temperature_age_thresholds = {{Temperature::kCold, 1000}}; - fifo_options.max_table_files_size = 100000000; - options.compaction_options_fifo = fifo_options; - env_->SetMockSleep(); - Reopen(options); + for (bool write_time_default : {false, true}) { + SCOPED_TRACE("write time default? " + std::to_string(write_time_default)); - int total_cold = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewWritableFile::FileOptions.temperature", [&](void* arg) { - Temperature temperature = *(static_cast(arg)); - if (temperature == Temperature::kCold) { - total_cold++; - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleFIFO; + options.num_levels = 1; + options.max_open_files = -1; + options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + CompactionOptionsFIFO fifo_options; + fifo_options.file_temperature_age_thresholds = {{Temperature::kCold, 1000}}; + fifo_options.max_table_files_size = 100000000; + options.compaction_options_fifo = fifo_options; + env_->SetMockSleep(); + if (write_time_default) { + options.default_write_temperature = Temperature::kWarm; + } + // Should be ignored (TODO: fail?) + options.last_level_temperature = Temperature::kHot; + Reopen(options); - // The file system does not support checksum handoff. The check - // will be ignored. - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); + int total_cold = 0; + int total_warm = 0; + int total_hot = 0; + int total_unknown = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile::FileOptions.temperature", [&](void* arg) { + Temperature temperature = *(static_cast(arg)); + if (temperature == Temperature::kCold) { + total_cold++; + } else if (temperature == Temperature::kWarm) { + total_warm++; + } else if (temperature == Temperature::kHot) { + total_hot++; + } else { + assert(temperature == Temperature::kUnknown); + total_unknown++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (write_time_default) { + // Also test dynamic option change + ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}})); + } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); - ColumnFamilyMetaData metadata; - db_->GetColumnFamilyMetaData(&metadata); - ASSERT_EQ(4, metadata.file_count); - ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); - ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); - ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature); - ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature); - ASSERT_EQ(2, total_cold); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - Destroy(options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(4, metadata.file_count); + if (write_time_default) { + ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature); + // Includes obsolete/deleted files moved to cold + ASSERT_EQ(total_warm, 3); + ASSERT_EQ(total_hot, 1); + // Includes non-SST DB files + ASSERT_GT(total_unknown, 0); + } else { + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); + ASSERT_EQ(total_warm, 0); + ASSERT_EQ(total_hot, 0); + // Includes non-SST DB files + ASSERT_GT(total_unknown, 4); + } + ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature); + ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature); + ASSERT_EQ(2, total_cold); + + Destroy(options); + } } TEST_F(DBCompactionTest, DisableMultiManualCompaction) { @@ -9636,6 +9737,7 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) { test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); TEST_SYNC_POINT( "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" @@ -9776,7 +9878,7 @@ TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytesUCToLC) { options.compaction_style = CompactionStyle::kCompactionStyleLevel; options.level_compaction_dynamic_level_bytes = true; ReopenWithColumnFamilies({"default", "pikachu"}, options); - std::string expected_lsm = ""; + std::string expected_lsm; for (int i = 0; i < 49; ++i) { expected_lsm += "0,"; } @@ -10184,8 +10286,7 @@ TEST_F(DBCompactionTest, ErrorWhenReadFileHead) { SyncPoint::GetInstance()->SetCallBack( "RandomAccessFileReader::Read::BeforeReturn", [&count, &error_file](void* pair_ptr) { - auto p = - reinterpret_cast*>(pair_ptr); + auto p = static_cast*>(pair_ptr); int cur = ++count; if (cur == error_file) { IOStatus* io_s = p->second; @@ -10293,20 +10394,20 @@ TEST_F(DBCompactionTest, ReleaseCompactionDuringManifestWrite) { SyncPoint::GetInstance()->EnableProcessing(); std::vector threads; - threads.emplace_back(std::thread([&]() { + threads.emplace_back([&]() { std::string k1_str = Key(1); std::string k2_str = Key(2); Slice k1 = k1_str; Slice k2 = k2_str; ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &k1, &k2)); - })); - threads.emplace_back(std::thread([&]() { + }); + threads.emplace_back([&]() { std::string k10_str = Key(10); std::string k11_str = Key(11); Slice k10 = k10_str; Slice k11 = k11_str; ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &k10, &k11)); - })); + }); std::string k100_str = Key(100); std::string k101_str = Key(101); Slice k100 = k100_str; diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index a1c2fa943a3..6bf7c7063df 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -338,7 +338,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { std::set output_levels; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionPicker::CompactRange:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); output_levels.insert(compaction->output_level()); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 40e7ac15548..cbc2db14f24 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -121,7 +121,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // DisableFileDeletions / EnableFileDeletions not supported in read-only DB if (deletions_disabled.ok()) { - Status s2 = EnableFileDeletions(/*force=*/false); + Status s2 = EnableFileDeletions(); assert(s2.ok()); s2.PermitUncheckedError(); } else { @@ -390,8 +390,11 @@ Status DBImpl::GetLiveFilesStorageInfo( info.file_number = live_wal_files[i]->LogNumber(); info.file_type = kWalFile; info.size = live_wal_files[i]->SizeFileBytes(); - // Only last should need to be trimmed - info.trim_to_size = (i + 1 == wal_size); + // Trim the log either if its the last one, or log file recycling is + // enabled. In the latter case, a hard link doesn't prevent the file + // from being renamed and recycled. So we need to copy it instead. + info.trim_to_size = (i + 1 == wal_size) || + (immutable_db_options_.recycle_log_file_num > 0); if (opts.include_checksum_info) { info.file_checksum_func_name = kUnknownFileChecksumFuncName; info.file_checksum = kUnknownFileChecksum; diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 515d24f13d4..8df111eda33 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -273,7 +273,7 @@ TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) { SyncPoint::GetInstance()->SetCallBack( "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) { ASSERT_NE(nullptr, arg); - auto unscheduled_flushes = *reinterpret_cast(arg); + auto unscheduled_flushes = *static_cast(arg); ASSERT_EQ(0, unscheduled_flushes); ++called; }); @@ -1368,14 +1368,15 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { ASSERT_OK(iter->status()); key = (iter->key()).ToString(false); value = (iter->value()).ToString(false); - if (key.compare(KEY3) == 0) + if (key.compare(KEY3) == 0) { ASSERT_EQ(value, p_v3b); - else if (key.compare(KEY4) == 0) + } else if (key.compare(KEY4) == 0) { ASSERT_EQ(value, p_v4); - else if (key.compare(KEY5) == 0) + } else if (key.compare(KEY5) == 0) { ASSERT_EQ(value, p_v5); - else + } else { ASSERT_EQ(value, NOT_FOUND); + } count++; } ASSERT_OK(iter->status()); @@ -1405,22 +1406,25 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { ASSERT_OK(iter->status()); key = (iter->key()).ToString(false); value = (iter->value()).ToString(false); - if (key.compare(KEY2) == 0) + if (key.compare(KEY2) == 0) { ASSERT_EQ(value, p_v2); - else if (key.compare(KEY3) == 0) + } else if (key.compare(KEY3) == 0) { ASSERT_EQ(value, p_v3b); - else if (key.compare(KEY4) == 0) + } else if (key.compare(KEY4) == 0) { ASSERT_EQ(value, p_v4); - else if (key.compare(KEY5) == 0) + } else if (key.compare(KEY5) == 0) { ASSERT_EQ(value, p_v5); - else + } else { ASSERT_EQ(value, NOT_FOUND); + } count++; } // Expected count here is 4: KEY2, KEY3, KEY4, KEY5. ASSERT_EQ(count, EXPECTED_COUNT_END); - if (iter) delete iter; + if (iter) { + delete iter; + } Close(); } @@ -1792,7 +1796,7 @@ TEST_F(DBFlushTest, MemPurgeCorrectLogNumberAndSSTFileCreation) { std::atomic num_memtable_at_first_flush(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FlushJob::WriteLevel0Table:num_memtables", [&](void* arg) { - uint64_t* mems_size = reinterpret_cast(arg); + uint64_t* mems_size = static_cast(arg); // atomic_compare_exchange_strong sometimes updates the value // of ZERO (the "expected" object), so we make sure ZERO is indeed... // zero. @@ -2039,7 +2043,7 @@ TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { SyncPoint::GetInstance()->SetCallBack( "FlushJob::WriteLevel0Table", [&listener](void* arg) { // Wait for the second flush finished, out of mutex. - auto* mems = reinterpret_cast*>(arg); + auto* mems = static_cast*>(arg); if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) { TEST_SYNC_POINT( "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:" @@ -2387,7 +2391,7 @@ TEST_F(DBFlushTest, PickRightMemtables) { }); SyncPoint::GetInstance()->SetCallBack( "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) { - auto* job = reinterpret_cast(arg); + auto* job = static_cast(arg); assert(job); const auto& mems = job->GetMemTables(); assert(mems.size() == 1); @@ -2500,7 +2504,7 @@ TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) { class SimpleTestFlushListener : public EventListener { public: explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {} - ~SimpleTestFlushListener() override {} + ~SimpleTestFlushListener() override = default; void OnFlushBegin(DB* db, const FlushJobInfo& info) override { ASSERT_EQ(static_cast(0), info.cf_id); @@ -2635,7 +2639,7 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST. cfs.push_back(kDefaultColumnFamilyName); ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); - DBImpl* db_impl = reinterpret_cast(db_); + DBImpl* db_impl = static_cast(db_); ASSERT_TRUE(db_impl->allow_2pc()); ASSERT_NE(db_impl->MinLogNumberToKeep(), 0); } @@ -3036,6 +3040,39 @@ TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) { SyncPoint::GetInstance()->ClearAllCallBacks(); } +TEST_P(DBAtomicFlushTest, FailureInMultiCfAutomaticFlush) { + bool atomic_flush = GetParam(); + auto fault_injection_env = std::make_shared(env_); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + const int kNumKeysTriggerFlush = 4; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysTriggerFlush)); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + for (size_t cf = 0; cf < handles_.size(); ++cf) { + ASSERT_OK(Put(static_cast(cf), "a", "value")); + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::ScheduleFlushes:PreSwitchMemtable", + [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 1; i < kNumKeysTriggerFlush; ++i) { + ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i))); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + // Next write after failed flush should fail. + ASSERT_NOK(Put(0, "x", "y")); + fault_injection_env->SetFilesystemActive(true); + Close(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + // In atomic flush, concurrent bg flush threads commit to the MANIFEST in // serial, in the order of their picked memtables for each column family. // Only when a bg flush thread finds out that its memtables are the earliest @@ -3139,7 +3176,7 @@ TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) { SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { - auto* ptr = reinterpret_cast(arg); + auto* ptr = static_cast(arg); assert(ptr); *ptr = IOStatus::IOError("Injected failure"); }); diff --git a/db/db_impl/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc index 3b665ea26b3..0e92ffd232f 100644 --- a/db/db_impl/compacted_db_impl.cc +++ b/db/db_impl/compacted_db_impl.cc @@ -13,10 +13,6 @@ namespace ROCKSDB_NAMESPACE { -extern void MarkKeyMayExist(void* arg); -extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v, bool hit_and_return); - CompactedDBImpl::CompactedDBImpl(const DBOptions& options, const std::string& dbname) : DBImpl(options, dbname, /*seq_per_batch*/ false, +/*batch_per_txn*/ true, @@ -25,7 +21,7 @@ CompactedDBImpl::CompactedDBImpl(const DBOptions& options, version_(nullptr), user_comparator_(nullptr) {} -CompactedDBImpl::~CompactedDBImpl() {} +CompactedDBImpl::~CompactedDBImpl() = default; size_t CompactedDBImpl::FindFile(const Slice& key) { size_t right = files_.num_files - 1; @@ -37,12 +33,6 @@ size_t CompactedDBImpl::FindFile(const Slice& key) { files_.files); } -Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, - const Slice& key, PinnableSlice* value) { - return Get(options, /*column_family*/ nullptr, key, value, - /*timestamp*/ nullptr); -} - Status CompactedDBImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value, std::string* timestamp) { @@ -112,62 +102,59 @@ Status CompactedDBImpl::Get(const ReadOptions& _read_options, return Status::NotFound(); } -std::vector CompactedDBImpl::MultiGet( - const ReadOptions& options, const std::vector&, - const std::vector& keys, std::vector* values) { - return MultiGet(options, keys, values, /*timestamps*/ nullptr); -} - -std::vector CompactedDBImpl::MultiGet( - const ReadOptions& _read_options, const std::vector&, - const std::vector& keys, std::vector* values, - std::vector* timestamps) { +void CompactedDBImpl::MultiGet(const ReadOptions& _read_options, + size_t num_keys, + ColumnFamilyHandle** /*column_families*/, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool /*sorted_input*/) { assert(user_comparator_); - size_t num_keys = keys.size(); + Status s; if (_read_options.io_activity != Env::IOActivity::kUnknown && _read_options.io_activity != Env::IOActivity::kMultiGet) { - Status s = Status::InvalidArgument( + s = Status::InvalidArgument( "Can only call MultiGet with `ReadOptions::io_activity` is " "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); - return std::vector(num_keys, s); } ReadOptions read_options(_read_options); - if (read_options.io_activity == Env::IOActivity::kUnknown) { - read_options.io_activity = Env::IOActivity::kMultiGet; - } - - if (read_options.timestamp) { - Status s = - FailIfTsMismatchCf(DefaultColumnFamily(), *(read_options.timestamp)); - if (!s.ok()) { - return std::vector(num_keys, s); + if (s.ok()) { + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; } - if (read_options.timestamp->size() > 0) { - s = FailIfReadCollapsedHistory(cfd_, cfd_->GetSuperVersion(), - *(read_options.timestamp)); - if (!s.ok()) { - return std::vector(num_keys, s); + + if (read_options.timestamp) { + s = FailIfTsMismatchCf(DefaultColumnFamily(), *(read_options.timestamp)); + if (s.ok()) { + if (read_options.timestamp->size() > 0) { + s = FailIfReadCollapsedHistory(cfd_, cfd_->GetSuperVersion(), + *(read_options.timestamp)); + } } + } else { + s = FailIfCfHasTs(DefaultColumnFamily()); } - } else { - Status s = FailIfCfHasTs(DefaultColumnFamily()); - if (!s.ok()) { - return std::vector(num_keys, s); + } + + if (!s.ok()) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = s; } + return; } // Clear the timestamps for returning results so that we can distinguish // between tombstone or key that has never been written if (timestamps) { - for (auto& ts : *timestamps) { - ts.clear(); + for (size_t i = 0; i < num_keys; ++i) { + timestamps[i].clear(); } } GetWithTimestampReadCallback read_cb(kMaxSequenceNumber); autovector reader_list; - for (const auto& key : keys) { + for (size_t i = 0; i < num_keys; ++i) { + const Slice& key = keys[i]; LookupKey lkey(key, kMaxSequenceNumber, read_options.timestamp); const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())]; if (user_comparator_->CompareWithoutTimestamp( @@ -181,30 +168,26 @@ std::vector CompactedDBImpl::MultiGet( reader_list.push_back(f.fd.table_reader); } } - std::vector statuses(num_keys, Status::NotFound()); - values->resize(num_keys); - if (timestamps) { - timestamps->resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::NotFound(); } int idx = 0; for (auto* r : reader_list) { if (r != nullptr) { - PinnableSlice pinnable_val; - std::string& value = (*values)[idx]; + PinnableSlice& pinnable_val = values[idx]; LookupKey lkey(keys[idx], kMaxSequenceNumber, read_options.timestamp); - std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr; + std::string* timestamp = timestamps ? ×tamps[idx] : nullptr; GetContext get_context( user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, lkey.user_key(), &pinnable_val, /*columns=*/nullptr, user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr, nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb); - Status s = + Status status = r->Get(read_options, lkey.internal_key(), &get_context, nullptr); - assert(static_cast(idx) < statuses.size()); - if (!s.ok() && !s.IsNotFound()) { - statuses[idx] = s; + assert(static_cast(idx) < num_keys); + if (!status.ok() && !status.IsNotFound()) { + statuses[idx] = status; } else { - value.assign(pinnable_val.data(), pinnable_val.size()); if (get_context.State() == GetContext::kFound) { statuses[idx] = Status::OK(); } @@ -212,7 +195,6 @@ std::vector CompactedDBImpl::MultiGet( } ++idx; } - return statuses; } Status CompactedDBImpl::Init(const Options& options) { diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index e1c605e420b..03853a5dda1 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -26,33 +26,23 @@ class CompactedDBImpl : public DBImpl { // Implementations of the DB interface using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; - - Status Get(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value, std::string* timestamp) override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + std::string* timestamp) override; using DB::MultiGet; // Note that CompactedDBImpl::MultiGet is not the optimized version of // MultiGet to use. // TODO: optimize CompactedDBImpl::MultiGet, see DBImpl::MultiGet for details. - virtual std::vector MultiGet( - const ReadOptions& options, const std::vector&, - const std::vector& keys, - std::vector* values) override; - - std::vector MultiGet(const ReadOptions& _read_options, - const std::vector&, - const std::vector& keys, - std::vector* values, - std::vector* timestamps) override; + void MultiGet(const ReadOptions& options, size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) override; using DBImpl::Put; - virtual Status Put(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported in compacted db mode."); } @@ -65,54 +55,53 @@ class CompactedDBImpl : public DBImpl { } using DBImpl::Merge; - virtual Status Merge(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { + Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice* /*begin*/, - const Slice* /*end*/) override { + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status DisableFileDeletions() override { + Status DisableFileDeletions() override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status EnableFileDeletions(bool /*force*/) override { + Status EnableFileDeletions() override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status GetLiveFiles(std::vector& ret, - uint64_t* manifest_file_size, - bool /*flush_memtable*/) override { + Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { return DBImpl::GetLiveFiles(ret, manifest_file_size, false /* flush_memtable */); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/) override { + Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status SyncWAL() override { + Status SyncWAL() override { return Status::NotSupported("Not supported in compacted db mode."); } using DB::IngestExternalFile; - virtual Status IngestExternalFile( + Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { @@ -120,7 +109,7 @@ class CompactedDBImpl : public DBImpl { } using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& /*options*/, const std::string& /*column_family_name*/, const ImportColumnFamilyOptions& /*import_options*/, @@ -130,9 +119,9 @@ class CompactedDBImpl : public DBImpl { } using DB::ClipColumnFamily; - virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, - const Slice& /*begin*/, - const Slice& /*end*/) override { + Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { return Status::NotSupported("Not supported in compacted db mode."); } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index c89c7120c80..1954fd433e3 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#include +#include #ifdef OS_SOLARIS #include #endif @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -44,7 +45,7 @@ #include "db/memtable.h" #include "db/memtable_list.h" #include "db/merge_context.h" -#include "db/merge_helper.h" +#include "db/multi_cf_iterator.h" #include "db/periodic_task_scheduler.h" #include "db/range_tombstone_fragmenter.h" #include "db/table_cache.h" @@ -72,6 +73,9 @@ #include "options/cf_options.h" #include "options/options_helper.h" #include "options/options_parser.h" +#ifdef ROCKSDB_JEMALLOC +#include "port/jemalloc_helper.h" +#endif #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -149,6 +153,12 @@ void DumpSupportInfo(Logger* logger) { crc32c::IsFastCrc32Supported().c_str()); ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName()); + + bool jemalloc_supported = false; +#ifdef ROCKSDB_JEMALLOC + jemalloc_supported = HasJemalloc(); +#endif + ROCKS_LOG_HEADER(logger, "Jemalloc supported: %d", jemalloc_supported); } } // namespace @@ -282,7 +292,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, dbname_, &immutable_db_options_, file_options_, table_cache_.get(), write_buffer_manager_, &write_controller_, &block_cache_tracer_, io_tracer_, db_id_, db_session_id_, options.daily_offpeak_time_utc, - &error_handler_)); + &error_handler_, read_only)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -311,7 +321,7 @@ Status DBImpl::Resume() { if (error_handler_.IsRecoveryInProgress()) { // Don't allow a mix of manual and automatic recovery - return Status::Busy(); + return Status::Busy("Recovery in progress"); } mutex_.Unlock(); @@ -335,8 +345,10 @@ Status DBImpl::Resume() { Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + WaitForBackgroundWork(); Status s; @@ -375,8 +387,8 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { assert(cfh); ColumnFamilyData* cfd = cfh->cfd(); const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); - s = versions_->LogAndApply(cfd, cf_opts, read_options, &edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, cf_opts, read_options, write_options, + &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { io_s = versions_->io_status(); if (!io_s.ok()) { @@ -659,6 +671,18 @@ Status DBImpl::CloseHelper() { // versions need to be destroyed before table_cache since it can hold // references to table_cache. + { + Status s = versions_->Close(directories_.GetDbDir(), &mutex_); + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unable to close MANIFEST with error -- %s", + s.ToString().c_str()); + if (ret.ok()) { + ret = s; + } + } + } + versions_.reset(); mutex_.Unlock(); if (db_lock_ != nullptr) { @@ -706,23 +730,26 @@ Status DBImpl::CloseHelper() { Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + // TODO: remove this. init_logger_creation_s_.PermitUncheckedError(); InstrumentedMutexLock closing_lock_guard(&closing_mutex_); - if (closed_) { - return; - } + if (!closed_) { + closed_ = true; - closed_ = true; + { + const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); + s.PermitUncheckedError(); + } - { - const Status s = MaybeReleaseTimestampedSnapshotsAndCheck(); - s.PermitUncheckedError(); + closing_status_ = CloseImpl(); + closing_status_.PermitUncheckedError(); } - - closing_status_ = CloseImpl(); - closing_status_.PermitUncheckedError(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } void DBImpl::MaybeIgnoreError(Status* s) const { @@ -797,7 +824,9 @@ Status DBImpl::StartPeriodicTaskScheduler() { return s; } -Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { +Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, + const WriteOptions& write_options, + bool is_new_db) { options_mutex_.AssertHeld(); uint64_t min_preserve_seconds = std::numeric_limits::max(); @@ -817,9 +846,15 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { } } if (min_preserve_seconds == std::numeric_limits::max()) { - seqno_to_time_mapping_.Resize(0, 0); + // Don't track + seqno_to_time_mapping_.SetCapacity(0); + seqno_to_time_mapping_.SetMaxTimeSpan(UINT64_MAX); } else { - seqno_to_time_mapping_.Resize(min_preserve_seconds, max_preserve_seconds); + uint64_t cap = std::min(kMaxSeqnoToTimeEntries, + max_preserve_seconds * kMaxSeqnoTimePairsPerCF / + min_preserve_seconds); + seqno_to_time_mapping_.SetCapacity(cap); + seqno_to_time_mapping_.SetMaxTimeSpan(max_preserve_seconds); } mapping_was_empty = seqno_to_time_mapping_.Empty(); } @@ -828,9 +863,8 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { if (min_preserve_seconds != std::numeric_limits::max()) { // round up to 1 when the time_duration is smaller than // kMaxSeqnoTimePairsPerCF - seqno_time_cadence = (min_preserve_seconds + - SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) / - SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF; + seqno_time_cadence = (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) / + kMaxSeqnoTimePairsPerCF; } TEST_SYNC_POINT_CALLBACK( @@ -867,7 +901,7 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { assert(mapping_was_empty); // We can simply modify these, before writes are allowed - constexpr uint64_t kMax = SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST; + constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST; versions_->SetLastAllocatedSequence(kMax); versions_->SetLastPublishedSequence(kMax); versions_->SetLastSequence(kMax); @@ -880,7 +914,8 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { VersionEdit edit; edit.SetLastSequence(kMax); s = versions_->LogAndApplyToDefaultColumnFamily( - {}, &edit, &mutex_, directories_.GetDbDir()); + read_options, write_options, &edit, &mutex_, + directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -927,7 +962,9 @@ size_t DBImpl::EstimateInMemoryStatsHistorySize() const { stats_history_mutex_.AssertHeld(); size_t size_total = sizeof(std::map>); - if (stats_history_.size() == 0) return size_total; + if (stats_history_.size() == 0) { + return size_total; + } size_t size_per_slice = sizeof(uint64_t) + sizeof(std::map); // non-empty map, stats_history_.begin() guaranteed to exist @@ -990,6 +1027,7 @@ void DBImpl::PersistStats() { stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); if (s.ok()) { + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions wo; wo.low_pri = true; wo.no_slowdown = true; @@ -1052,7 +1090,9 @@ bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time, std::map* stats_map) { assert(new_time); assert(stats_map); - if (!new_time || !stats_map) return false; + if (!new_time || !stats_map) { + return false; + } // lock when search for start_time { InstrumentedMutexLock l(&stats_history_mutex_); @@ -1565,10 +1605,10 @@ Status DBImpl::ApplyReplicationLogRecord(ReplicationLogRecord record, if (!s.ok() || info->diverged_manifest_writes) { break; } - s = versions_->LogAndApply(cfds, mutable_cf_options_list, ReadOptions(), - edit_lists, &mutex_, directories_.GetDbDir(), - false /* new_descriptor_log */, - &*cf_options); + s = versions_->LogAndApply( + cfds, mutable_cf_options_list, ReadOptions(), WriteOptions(), + edit_lists, &mutex_, directories_.GetDbDir(), + false /* new_descriptor_log */, &*cf_options); if (!s.ok()) { break; } @@ -1751,8 +1791,10 @@ Status DBImpl::GetManifestUpdateSequence(uint64_t* out) { Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + auto* cfd = static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { @@ -1784,15 +1826,17 @@ Status DBImpl::SetOptions( if (!only_set_disable_write_stall) { // Append new version to recompute compaction score. VersionEdit dummy_edit; - s = versions_->LogAndApply(cfd, new_options, read_options, &dummy_edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, read_options, + write_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); } // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); - persist_options_status = WriteOptionsFile(true /*db_mutex_already_held*/); + persist_options_status = + WriteOptionsFile(write_options, true /*db_mutex_already_held*/); bg_cv_.SignalAll(); } } @@ -1971,7 +2015,8 @@ Status DBImpl::SetDBOptions( } write_thread_.ExitUnbatched(&w); } - persist_options_status = WriteOptionsFile(true /*db_mutex_already_held*/); + persist_options_status = + WriteOptionsFile(WriteOptions(), true /*db_mutex_already_held*/); } else { // To get here, we must have had invalid options and will not attempt to // persist the options, which means the status is "OK/Uninitialized. @@ -2012,7 +2057,9 @@ int DBImpl::FindMinimumEmptyLevelFitting( int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (vstorage->NumLevelFiles(i) > 0) break; + if (vstorage->NumLevelFiles(i) > 0) { + break; + } // stop if level i is too small (cannot fit the level files) if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { break; @@ -2023,14 +2070,14 @@ int DBImpl::FindMinimumEmptyLevelFitting( return minimum_level; } -Status DBImpl::FlushWAL(bool sync) { +Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) { if (manual_wal_flush_) { IOStatus io_s; { // We need to lock log_write_mutex_ since logs_ might change concurrently InstrumentedMutexLock wl(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; - io_s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(write_options); } if (!io_s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", @@ -2103,11 +2150,22 @@ Status DBImpl::SyncWAL() { RecordTick(stats_, WAL_FILE_SYNCED); Status status; IOStatus io_s; - for (log::Writer* log : logs_to_sync) { - io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); - if (!io_s.ok()) { - status = io_s; - break; + // TODO: plumb Env::IOActivity, Env::IOPriority + const ReadOptions read_options; + const WriteOptions write_options; + IOOptions opts; + io_s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!io_s.ok()) { + status = io_s; + } + if (io_s.ok()) { + for (log::Writer* log : logs_to_sync) { + io_s = + log->file()->SyncWithoutFlush(opts, immutable_db_options_.use_fsync); + if (!io_s.ok()) { + status = io_s; + break; + } } } if (!io_s.ok()) { @@ -2136,9 +2194,7 @@ Status DBImpl::SyncWAL() { } if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - // TODO: plumb Env::IOActivity - const ReadOptions read_options; - status = ApplyWALToManifest(read_options, &synced_wals); + status = ApplyWALToManifest(read_options, write_options, &synced_wals); } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -2147,12 +2203,14 @@ Status DBImpl::SyncWAL() { } Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options, + const WriteOptions& write_options, VersionEdit* synced_wals) { // not empty, write to MANIFEST. mutex_.AssertHeld(); Status status = versions_->LogAndApplyToDefaultColumnFamily( - read_options, synced_wals, &mutex_, directories_.GetDbDir()); + read_options, write_options, synced_wals, &mutex_, + directories_.GetDbDir()); if (!status.ok() && versions_->io_status().IsIOError()) { status = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -2259,7 +2317,11 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, wal.GetPreSyncSize() > 0) { synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); } - if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) { + // Check if the file has been closed, i.e wal.writer->file() == nullptr + // which can happen if log recycling is enabled, or if all the data in + // the log has been synced + if (wal.writer->file() == nullptr || + wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) { // Fully synced logs_to_free_.push_back(wal.ReleaseWriter()); it = logs_.erase(it); @@ -2425,7 +2487,7 @@ struct SuperVersionHandle { }; static void CleanupSuperVersionHandle(void* arg1, void* /*arg2*/) { - SuperVersionHandle* sv_handle = reinterpret_cast(arg1); + SuperVersionHandle* sv_handle = static_cast(arg1); if (sv_handle->super_version->Unref()) { // Job id == 0 means that this is not our background process, but rather @@ -2482,7 +2544,8 @@ InternalIterator* DBImpl::NewInternalIterator( super_version->mutable_cf_options.prefix_extractor != nullptr, read_options.iterate_upper_bound); // Collect iterator for mutable memtable - auto mem_iter = super_version->mem->NewIterator(read_options, arena); + auto mem_iter = super_version->mem->NewIterator( + read_options, super_version->GetSeqnoToTimeMapping(), arena); Status s; if (!read_options.ignore_range_deletions) { TruncatedRangeDelIterator* mem_tombstone_iter = nullptr; @@ -2504,8 +2567,9 @@ InternalIterator* DBImpl::NewInternalIterator( // Collect all needed child iterators for immutable memtables if (s.ok()) { - super_version->imm->AddIterators(read_options, &merge_iter_builder, - !read_options.ignore_range_deletions); + super_version->imm->AddIterators( + read_options, super_version->GetSeqnoToTimeMapping(), + &merge_iter_builder, !read_options.ignore_range_deletions); } TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); @@ -2553,12 +2617,6 @@ ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { return persist_stats_cf_handle_; } -Status DBImpl::Get(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) { - return Get(read_options, column_family, key, value, /*timestamp=*/nullptr); -} - Status DBImpl::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { @@ -2814,7 +2872,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, snapshot = get_impl_options.callback->max_visible_seq(); } else { snapshot = - reinterpret_cast(read_options.snapshot)->number_; + static_cast(read_options.snapshot)->number_; } } else { // Note that the snapshot is assigned AFTER referencing the super @@ -2859,6 +2917,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, // Prepare to store a list of merge operations if merge occurs. MergeContext merge_context; + merge_context.get_merge_operands_options = + get_impl_options.get_merge_operands_options; SequenceNumber max_covering_tombstone_seq = 0; Status s; @@ -3048,274 +3108,6 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, return s; } -std::vector DBImpl::MultiGet( - const ReadOptions& read_options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) { - return MultiGet(read_options, column_family, keys, values, - /*timestamps=*/nullptr); -} - -std::vector DBImpl::MultiGet( - const ReadOptions& _read_options, - const std::vector& column_family, - const std::vector& keys, std::vector* values, - std::vector* timestamps) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); - StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); - PERF_TIMER_GUARD(get_snapshot_time); - - size_t num_keys = keys.size(); - assert(column_family.size() == num_keys); - std::vector stat_list(num_keys); - - // RocksDB-Cloud contribution begin - auto super_snapshot = - dynamic_cast(_read_options.snapshot); - // RocksDB-Cloud contribution end - - if (_read_options.io_activity != Env::IOActivity::kUnknown && - _read_options.io_activity != Env::IOActivity::kMultiGet) { - Status s = Status::InvalidArgument( - "Can only call MultiGet with `ReadOptions::io_activity` is " - "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); - - for (size_t i = 0; i < num_keys; ++i) { - stat_list[i] = s; - } - return stat_list; - } - - ReadOptions read_options(_read_options); - if (read_options.io_activity == Env::IOActivity::kUnknown) { - read_options.io_activity = Env::IOActivity::kMultiGet; - } - - bool should_fail = false; - for (size_t i = 0; i < num_keys; ++i) { - assert(column_family[i]); - if (read_options.timestamp) { - stat_list[i] = - FailIfTsMismatchCf(column_family[i], *(read_options.timestamp)); - if (!stat_list[i].ok()) { - should_fail = true; - } - } else { - stat_list[i] = FailIfCfHasTs(column_family[i]); - if (!stat_list[i].ok()) { - should_fail = true; - } - } - // RocksDB-Cloud contribution begin - auto cfh = static_cast_with_check(column_family[i]); - auto cfd = cfh->cfd(); - if (super_snapshot && cfd->GetID() != super_snapshot->cfd()->GetID()) { - std::ostringstream oss; - oss << "[MultiGet] SuperSnapshot column family " - << super_snapshot->cfd()->GetName() - << " doesn't match provided column family " << cfd->GetName(); - stat_list[i] = Status::InvalidArgument(oss.str()); - should_fail = true; - } - // RocksDB-Cloud contribution end - } - - if (should_fail) { - for (auto& s : stat_list) { - if (s.ok()) { - s = Status::Incomplete( - "DB not queried due to invalid argument(s) in the same MultiGet"); - } - } - return stat_list; - } - - if (tracer_) { - // TODO: This mutex should be removed later, to improve performance when - // tracing is enabled. - InstrumentedMutexLock lock(&trace_mutex_); - if (tracer_) { - // TODO: maybe handle the tracing status? - tracer_->MultiGet(column_family, keys).PermitUncheckedError(); - } - } - - UnorderedMap multiget_cf_data( - column_family.size()); - for (auto cf : column_family) { - auto cfh = static_cast_with_check(cf); - auto cfd = cfh->cfd(); - if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { - multiget_cf_data.emplace(cfd->GetID(), - MultiGetColumnFamilyData(cfh, nullptr)); - } - } - - std::function::iterator&)> - iter_deref_lambda = - [](UnorderedMap::iterator& - cf_iter) { return &cf_iter->second; }; - - SequenceNumber consistent_seqnum; - bool sv_from_thread_local; - Status status = - MultiCFSnapshot>( - read_options, nullptr, iter_deref_lambda, &multiget_cf_data, - &consistent_seqnum, &sv_from_thread_local); - - if (!status.ok()) { - for (auto& s : stat_list) { - if (s.ok()) { - s = status; - } - } - return stat_list; - } - - TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1"); - TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2"); - - // Contain a list of merge operations if merge occurs. - MergeContext merge_context; - - // Note: this always resizes the values array - values->resize(num_keys); - if (timestamps) { - timestamps->resize(num_keys); - } - - // Keep track of bytes that we read for statistics-recording later - uint64_t bytes_read = 0; - PERF_TIMER_STOP(get_snapshot_time); - - // For each of the given keys, apply the entire "get" process as follows: - // First look in the memtable, then in the immutable memtable (if any). - // s is both in/out. When in, s could either be OK or MergeInProgress. - // merge_operands will contain the sequence of merges in the latter case. - size_t num_found = 0; - size_t keys_read; - uint64_t curr_value_size = 0; - - GetWithTimestampReadCallback timestamp_read_callback(0); - ReadCallback* read_callback = nullptr; - if (read_options.timestamp && read_options.timestamp->size() > 0) { - timestamp_read_callback.Refresh(consistent_seqnum); - read_callback = ×tamp_read_callback; - } - - for (keys_read = 0; keys_read < num_keys; ++keys_read) { - merge_context.Clear(); - Status& s = stat_list[keys_read]; - std::string* value = &(*values)[keys_read]; - std::string* timestamp = timestamps ? &(*timestamps)[keys_read] : nullptr; - - LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp); - auto cfh = static_cast_with_check( - column_family[keys_read]); - SequenceNumber max_covering_tombstone_seq = 0; - auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); - assert(mgd_iter != multiget_cf_data.end()); - auto mgd = mgd_iter->second; - auto super_version = mgd.super_version; - bool skip_memtable = - (read_options.read_tier == kPersistedTier && - has_unpersisted_data_.load(std::memory_order_relaxed)); - bool done = false; - if (!skip_memtable) { - if (super_version->mem->Get( - lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context, - &max_covering_tombstone_seq, read_options, - false /* immutable_memtable */, read_callback)) { - done = true; - RecordTick(stats_, MEMTABLE_HIT); - } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr, - timestamp, &s, &merge_context, - &max_covering_tombstone_seq, - read_options, read_callback)) { - done = true; - RecordTick(stats_, MEMTABLE_HIT); - } - } - if (!done) { - PinnableSlice pinnable_val; - PERF_TIMER_GUARD(get_from_output_files_time); - PinnedIteratorsManager pinned_iters_mgr; - super_version->current->Get(read_options, lkey, &pinnable_val, - /*columns=*/nullptr, timestamp, &s, - &merge_context, &max_covering_tombstone_seq, - &pinned_iters_mgr, /*value_found=*/nullptr, - /*key_exists=*/nullptr, - /*seq=*/nullptr, read_callback); - value->assign(pinnable_val.data(), pinnable_val.size()); - RecordTick(stats_, MEMTABLE_MISS); - } - - if (s.ok()) { - const auto& merge_threshold = read_options.merge_operand_count_threshold; - if (merge_threshold.has_value() && - merge_context.GetNumOperands() > merge_threshold.value()) { - s = Status::OkMergeOperandThresholdExceeded(); - } - - bytes_read += value->size(); - num_found++; - - curr_value_size += value->size(); - if (curr_value_size > read_options.value_size_soft_limit) { - while (++keys_read < num_keys) { - stat_list[keys_read] = Status::Aborted(); - } - break; - } - } - if (read_options.deadline.count() && - immutable_db_options_.clock->NowMicros() > - static_cast(read_options.deadline.count())) { - break; - } - } - - if (keys_read < num_keys) { - // The only reason to break out of the loop is when the deadline is - // exceeded - assert(immutable_db_options_.clock->NowMicros() > - static_cast(read_options.deadline.count())); - for (++keys_read; keys_read < num_keys; ++keys_read) { - stat_list[keys_read] = Status::TimedOut(); - } - } - - // Post processing (decrement reference counts and record statistics) - PERF_TIMER_GUARD(get_post_process_time); - - // Only cleanup the super versions if we don't have super snapshot, which - // brought its own superversion. - // RocksDB-Cloud contribution begin - if (!dynamic_cast(read_options.snapshot)) { - // RocksDB-Cloud contribution end - for (auto mgd_iter : multiget_cf_data) { - auto mgd = mgd_iter.second; - if (sv_from_thread_local) { - ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); - } else { - TEST_SYNC_POINT("DBImpl::MultiGet::BeforeLastTryUnRefSV"); - CleanupSuperVersion(mgd.super_version); - } - } - } - - RecordTick(stats_, NUMBER_MULTIGET_CALLS); - RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); - RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); - RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); - RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); - PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); - PERF_TIMER_STOP(get_post_process_time); - - return stat_list; -} - template Status DBImpl::MultiCFSnapshot( const ReadOptions& read_options, ReadCallback* callback, @@ -3332,20 +3124,24 @@ Status DBImpl::MultiCFSnapshot( // sv_from_thread_local set to false means the SuperVersion to be cleaned up // is acquired directly via ColumnFamilyData instead of thread local. const auto sv_cleanup_func = [&]() -> void { - for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); - ++cf_iter) { - auto node = iter_deref_func(cf_iter); - SuperVersion* super_version = node->super_version; - ColumnFamilyData* cfd = node->cfd; - if (super_version != nullptr) { - if (*sv_from_thread_local) { - ReturnAndCleanupSuperVersion(cfd, super_version); - } else { - CleanupSuperVersion(super_version); + // RocksDB-Cloud contribution begin + if (!dynamic_cast(read_options.snapshot)) { + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + SuperVersion* super_version = node->super_version; + ColumnFamilyData* cfd = node->cfd; + if (super_version != nullptr) { + if (*sv_from_thread_local) { + ReturnAndCleanupSuperVersion(cfd, super_version); + } else { + CleanupSuperVersion(super_version); + } } + node->super_version = nullptr; } - node->super_version = nullptr; } + // RocksDB-Cloud contribution end }; bool last_try = false; @@ -3357,6 +3153,13 @@ Status DBImpl::MultiCFSnapshot( // RocksDB-Cloud contribution begin auto super_snapshot = dynamic_cast(read_options.snapshot); + if (super_snapshot && + node->cfd->GetID() != super_snapshot->cfd()->GetID()) { + std::ostringstream oss; + oss << "SuperSnapshot column family " << super_snapshot->cfd()->GetName() + << " doesn't match provided column family " << node->cfd->GetName(); + return Status::InvalidArgument(oss.str()); + } node->super_version = super_snapshot ? super_snapshot->sv() : GetAndRefSuperVersion(node->cfd); // RocksDB-Cloud contribution end @@ -3394,7 +3197,11 @@ Status DBImpl::MultiCFSnapshot( } else { // RocksDB-Cloud contribution begin // MultiGet across column families is not supported with super snapshot - assert(!dynamic_cast(read_options.snapshot)); + if (dynamic_cast(read_options.snapshot)) { + return Status::InvalidArgument( + "MultiGet with SuperSnapshot doesn't support multiple column " + "families"); + } // RocksDB-Cloud contribution end // If we end up with the same issue of memtable getting sealed during 2 // consecutive retries, it means the write rate is very high. In that case @@ -3473,7 +3280,8 @@ Status DBImpl::MultiCFSnapshot( } } - // Keep track of bytes that we read for statistics-recording later + TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum1"); + TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum2"); PERF_TIMER_STOP(get_snapshot_time); *sv_from_thread_local = !last_try; if (!s.ok()) { @@ -3482,14 +3290,6 @@ Status DBImpl::MultiCFSnapshot( return s; } -void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, - ColumnFamilyHandle** column_families, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input) { - MultiGet(read_options, num_keys, column_families, keys, values, - /* timestamps */ nullptr, statuses, sorted_input); -} - void DBImpl::MultiGet(const ReadOptions& _read_options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, std::string* timestamps, @@ -3525,16 +3325,6 @@ void DBImpl::MultiGetCommon(const ReadOptions& read_options, return; } - // RocksDB-Cloud contribution begin - if (dynamic_cast(read_options.snapshot)) { - for (size_t i = 0; i < num_keys; ++i) { - statuses[i] = Status::NotSupported( - "This variant of MultiGet does not yet support super snapshots"); - } - return; - } - // RocksDB-Cloud contribution end - bool should_fail = false; for (size_t i = 0; i < num_keys; ++i) { ColumnFamilyHandle* cfh = column_families[i]; @@ -3663,14 +3453,18 @@ void DBImpl::MultiGetCommon(const ReadOptions& read_options, } } - for (const auto& iter : multiget_cf_data) { - if (sv_from_thread_local) { - ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version); - } else { - TEST_SYNC_POINT("DBImpl::MultiGet::BeforeLastTryUnRefSV"); - CleanupSuperVersion(iter.super_version); + // RocksDB-Cloud contribution begin + if (!dynamic_cast(read_options.snapshot)) { + for (const auto& iter : multiget_cf_data) { + if (sv_from_thread_local) { + ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version); + } else { + TEST_SYNC_POINT("DBImpl::MultiGet::BeforeLastTryUnRefSV"); + CleanupSuperVersion(iter.super_version); + } } } + // RocksDB-Cloud contribution end } namespace { @@ -3717,38 +3511,23 @@ void DBImpl::PrepareMultiGetKeys( CompareKeyContext()); } -void DBImpl::MultiGet(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const size_t num_keys, - const Slice* keys, PinnableSlice* values, - Status* statuses, const bool sorted_input) { - MultiGet(read_options, column_family, num_keys, keys, values, - /* timestamps */ nullptr, statuses, sorted_input); -} - -void DBImpl::MultiGet(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const size_t num_keys, - const Slice* keys, PinnableSlice* values, - std::string* timestamps, Status* statuses, - const bool sorted_input) { - if (_read_options.io_activity != Env::IOActivity::kUnknown && - _read_options.io_activity != Env::IOActivity::kMultiGet) { - Status s = Status::InvalidArgument( - "Can only call MultiGet with `ReadOptions::io_activity` is " - "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); - for (size_t i = 0; i < num_keys; ++i) { - if (statuses[i].ok()) { - statuses[i] = s; - } - } - return; - } - - ReadOptions read_options(_read_options); - if (read_options.io_activity == Env::IOActivity::kUnknown) { - read_options.io_activity = Env::IOActivity::kMultiGet; +void DB::MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) { + // Use std::array, if possible, to avoid memory allocation overhead + if (num_keys > MultiGetContext::MAX_BATCH_SIZE) { + std::vector column_families(num_keys, column_family); + MultiGet(options, num_keys, column_families.data(), keys, values, + timestamps, statuses, sorted_input); + } else { + std::array + column_families; + std::fill(column_families.begin(), column_families.begin() + num_keys, + column_family); + MultiGet(options, num_keys, column_families.data(), keys, values, + timestamps, statuses, sorted_input); } - MultiGetCommon(read_options, column_family, num_keys, keys, values, - /* columns */ nullptr, timestamps, statuses, sorted_input); } void DBImpl::MultiGetCommon(const ReadOptions& read_options, @@ -4114,6 +3893,7 @@ void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys, } Status DBImpl::WrapUpCreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& cf_options) { // NOTE: this function is skipped for create_missing_column_families and // DB::Open, so new functionality here might need to go into Open also. @@ -4126,26 +3906,32 @@ Status DBImpl::WrapUpCreateColumnFamilies( } } // Attempt both follow-up actions even if one fails - Status s = WriteOptionsFile(false /*db_mutex_already_held*/); + Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/); if (register_worker) { - s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(/*from_db_open=*/false)); + s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(read_options, write_options, + /* is_new_db */ false)); } return s; } -Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, +Status DBImpl::CreateColumnFamily(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& column_family, ColumnFamilyHandle** handle) { assert(handle != nullptr); InstrumentedMutexLock ol(&options_mutex_); - Status s = CreateColumnFamilyImpl(cf_options, column_family, handle); + Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options, + column_family, handle); if (s.ok()) { - s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); + s.UpdateIfOk( + WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options})); } return s; } Status DBImpl::CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const ColumnFamilyOptions& cf_options, const std::vector& column_family_names, std::vector* handles) { @@ -4157,7 +3943,8 @@ Status DBImpl::CreateColumnFamilies( bool success_once = false; for (size_t i = 0; i < num_cf; i++) { ColumnFamilyHandle* handle; - s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle); + s = CreateColumnFamilyImpl(read_options, write_options, cf_options, + column_family_names[i], &handle); if (!s.ok()) { break; } @@ -4165,12 +3952,14 @@ Status DBImpl::CreateColumnFamilies( success_once = true; } if (success_once) { - s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); + s.UpdateIfOk( + WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options})); } return s; } Status DBImpl::CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& column_families, std::vector* handles) { assert(handles != nullptr); @@ -4183,7 +3972,8 @@ Status DBImpl::CreateColumnFamilies( cf_opts.reserve(num_cf); for (size_t i = 0; i < num_cf; i++) { ColumnFamilyHandle* handle; - s = CreateColumnFamilyImpl(column_families[i].options, + s = CreateColumnFamilyImpl(read_options, write_options, + column_families[i].options, column_families[i].name, &handle); if (!s.ok()) { break; @@ -4193,17 +3983,18 @@ Status DBImpl::CreateColumnFamilies( cf_opts.push_back(&column_families[i].options); } if (success_once) { - s.UpdateIfOk(WrapUpCreateColumnFamilies(cf_opts)); + s.UpdateIfOk( + WrapUpCreateColumnFamilies(read_options, write_options, cf_opts)); } return s; } -Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, +Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { options_mutex_.AssertHeld(); - // TODO: plumb Env::IOActivity - const ReadOptions read_options; Status s; *handle = nullptr; @@ -4247,7 +4038,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir(), false, &cf_options); write_thread_.ExitUnbatched(&w); } @@ -4296,7 +4087,8 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { InstrumentedMutexLock ol(&options_mutex_); Status s = DropColumnFamilyImpl(column_family); if (s.ok()) { - s = WriteOptionsFile(false /*db_mutex_already_held*/); + // TODO: plumb Env::IOActivity, Env::IOPriority + s = WriteOptionsFile(WriteOptions(), false /*db_mutex_already_held*/); } return s; } @@ -4314,8 +4106,9 @@ Status DBImpl::DropColumnFamilies( success_once = true; } if (success_once) { + // TODO: plumb Env::IOActivity, Env::IOPriority Status persist_options_status = - WriteOptionsFile(false /*db_mutex_already_held*/); + WriteOptionsFile(WriteOptions(), false /*db_mutex_already_held*/); if (s.ok() && !persist_options_status.ok()) { s = persist_options_status; } @@ -4324,8 +4117,10 @@ Status DBImpl::DropColumnFamilies( } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { @@ -4349,7 +4144,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); write_thread_.ExitUnbatched(&w); } @@ -4376,7 +4171,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { if (cfd->ioptions()->preserve_internal_time_seconds > 0 || cfd->ioptions()->preclude_last_level_data_seconds > 0) { - s = RegisterRecordSeqnoTimeWorker(/*from_db_open=*/false); + s = RegisterRecordSeqnoTimeWorker(read_options, write_options, + /* is_new_db */ false); } if (s.ok()) { @@ -4407,7 +4203,7 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, // falsify later if key-may-exist but can't fetch value *value_found = true; } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; @@ -4463,9 +4259,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, } auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); ColumnFamilyData* cfd = cfh->cfd(); assert(cfd != nullptr); - ReadCallback* read_callback = nullptr; // No read callback provided. // RocksDB-Cloud contribution begin auto super_snapshot = dynamic_cast(read_options.snapshot); @@ -4495,8 +4291,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, - this, cfd); + sv->mutable_cf_options.max_sequential_skip_in_iterations, + nullptr /* read_callback */, cfh); } else { // RocksDB-Cloud contribution begin if (super_snapshot && cfd->GetID() != super_snapshot->cfd()->GetID()) { @@ -4512,18 +4308,18 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, // Note: no need to consider the special case of // last_seq_same_as_publish_seq_==false since NewIterator is overridden in // WritePreparedTxnDB - result = NewIteratorImpl(read_options, cfd, sv, + result = NewIteratorImpl(read_options, cfh, sv, (read_options.snapshot != nullptr) ? read_options.snapshot->GetSequenceNumber() : kMaxSequenceNumber, - read_callback); + nullptr /* read_callback */); } return result; } ArenaWrappedDBIter* DBImpl::NewIteratorImpl( - const ReadOptions& read_options, ColumnFamilyData* cfd, SuperVersion* sv, - SequenceNumber snapshot, ReadCallback* read_callback, + const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh, + SuperVersion* sv, SequenceNumber snapshot, ReadCallback* read_callback, bool expose_blob_index, bool allow_refresh) { TEST_SYNC_POINT("DBImpl::NewIterator:1"); TEST_SYNC_POINT("DBImpl::NewIterator:2"); @@ -4586,19 +4382,44 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl( // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, - snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, this, cfd, expose_blob_index, - allow_refresh); + env_, read_options, *cfh->cfd()->ioptions(), sv->mutable_cf_options, + sv->current, snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, cfh, expose_blob_index, allow_refresh); InternalIterator* internal_iter = NewInternalIterator( - db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot, + db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), snapshot, /* allow_unprepared_value */ true, db_iter); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } +std::unique_ptr DBImpl::NewMultiCfIterator( + const ReadOptions& _read_options, + const std::vector& column_families) { + if (column_families.size() == 0) { + return std::unique_ptr(NewErrorIterator( + Status::InvalidArgument("No Column Family was provided"))); + } + const Comparator* first_comparator = column_families[0]->GetComparator(); + for (size_t i = 1; i < column_families.size(); ++i) { + const Comparator* cf_comparator = column_families[i]->GetComparator(); + if (first_comparator != cf_comparator && + first_comparator->GetId().compare(cf_comparator->GetId()) != 0) { + return std::unique_ptr(NewErrorIterator(Status::InvalidArgument( + "Different comparators are being used across CFs"))); + } + } + std::vector child_iterators; + Status s = NewIterators(_read_options, column_families, &child_iterators); + if (s.ok()) { + return std::make_unique(first_comparator, column_families, + std::move(child_iterators)); + } + return std::unique_ptr(NewErrorIterator(s)); +} + Status DBImpl::NewIterators( const ReadOptions& _read_options, const std::vector& column_families, @@ -4639,14 +4460,14 @@ Status DBImpl::NewIterators( } } - ReadCallback* read_callback = nullptr; // No read callback provided. iterators->clear(); iterators->reserve(column_families.size()); - autovector> cfd_to_sv; + autovector> cfh_to_sv; const bool check_read_ts = read_options.timestamp && read_options.timestamp->size() > 0; - for (auto cfh : column_families) { - auto cfd = static_cast_with_check(cfh)->cfd(); + for (auto cf : column_families) { + auto cfh = static_cast_with_check(cf); + auto cfd = cfh->cfd(); // RocksDB-Cloud contribution begin auto super_snapshot = @@ -4664,32 +4485,32 @@ Status DBImpl::NewIterators( : cfd->GetReferencedSuperVersion(this); // RocksDB-Cloud contribution end - cfd_to_sv.emplace_back(cfd, sv); + cfh_to_sv.emplace_back(cfh, sv); if (check_read_ts) { const Status s = FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); if (!s.ok()) { - for (auto prev_entry : cfd_to_sv) { + for (auto prev_entry : cfh_to_sv) { CleanupSuperVersion(std::get<1>(prev_entry)); } return s; } } } - assert(cfd_to_sv.size() == column_families.size()); + assert(cfh_to_sv.size() == column_families.size()); if (read_options.tailing) { if (dynamic_cast(read_options.snapshot)) { return Status::NotSupported( "Tailing iterator not supported with super snapshot"); } - for (auto [cfd, sv] : cfd_to_sv) { - auto iter = new ForwardIterator(this, read_options, cfd, sv, + for (auto [cfh, sv] : cfh_to_sv) { + auto iter = new ForwardIterator(this, read_options, cfh->cfd(), sv, /* allow_unprepared_value */ true); iterators->push_back(NewDBIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, + env_, read_options, *cfh->cfd()->ioptions(), sv->mutable_cf_options, + cfh->cfd()->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback, this, cfd)); + nullptr /*read_callback*/, cfh)); } } else { // Note: no need to consider the special case of @@ -4698,9 +4519,9 @@ Status DBImpl::NewIterators( auto snapshot = read_options.snapshot != nullptr ? read_options.snapshot->GetSequenceNumber() : versions_->LastSequence(); - for (auto [cfd, sv] : cfd_to_sv) { - iterators->push_back( - NewIteratorImpl(read_options, cfd, sv, snapshot, read_callback)); + for (auto [cfh, sv] : cfh_to_sv) { + iterators->push_back(NewIteratorImpl(read_options, cfh, sv, snapshot, + nullptr /*read_callback*/)); } } @@ -4963,7 +4784,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { } // RocksDB-Cloud contribution end - const SnapshotImpl* casted_s = reinterpret_cast(s); + const SnapshotImpl* casted_s = static_cast(s); { InstrumentedMutexLock l(&mutex_); snapshots_.Delete(casted_s); @@ -5022,7 +4843,7 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = version->GetPropertiesOfAllTables(read_options, props); @@ -5046,9 +4867,27 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; - auto s = version->GetPropertiesOfTablesInRange(read_options, range, n, props); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + + autovector ukey_ranges; + std::vector keys; + ukey_ranges.reserve(n); + keys.reserve(2 * n); + // Add timestamp if needed + for (size_t i = 0; i < n; i++) { + auto [start, limit] = MaybeAddTimestampsToRange( + &range[i].start, &range[i].limit, ts_sz, &keys.emplace_back(), + &keys.emplace_back(), /*exclusive_end=*/false); + assert(start.has_value()); + assert(limit.has_value()); + ukey_ranges.emplace_back(start.value(), limit.value()); + } + auto s = + version->GetPropertiesOfTablesInRange(read_options, ukey_ranges, props); // Decrement the ref count mutex_.Lock(); @@ -5388,7 +5227,7 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; for (int i = 0; i < n; i++) { // Add timestamp if needed @@ -5452,8 +5291,10 @@ Status DBImpl::GetUpdatesSince( } Status DBImpl::DeleteFile(std::string name) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + uint64_t number; FileType type; WalFileType log_type; @@ -5533,12 +5374,12 @@ Status DBImpl::DeleteFile(std::string name) { edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, - &job_context.superversion_contexts[0], - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork( + cfd, job_context.superversion_contexts.data(), + *cfd->GetLatestMutableCFOptions()); } FindObsoleteFiles(&job_context, false); } // lock released here @@ -5556,11 +5397,31 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); + const Comparator* ucmp = cfd->user_comparator(); + assert(ucmp); + const size_t ts_sz = ucmp->timestamp_size(); + autovector ukey_ranges; + std::vector keys; + std::vector key_slices; + ukey_ranges.reserve(n); + keys.reserve(2 * n); + key_slices.reserve(2 * n); + for (size_t i = 0; i < n; i++) { + auto [start, limit] = MaybeAddTimestampsToRange( + ranges[i].start, ranges[i].limit, ts_sz, &keys.emplace_back(), + &keys.emplace_back(), !include_end); + assert((ranges[i].start != nullptr) == start.has_value()); + assert((ranges[i].limit != nullptr) == limit.has_value()); + ukey_ranges.emplace_back(start, limit); + } + VersionEdit edit; std::set deleted_files; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -5569,8 +5430,9 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, Version* input_version = cfd->current(); auto* vstorage = input_version->storage_info(); - for (size_t r = 0; r < n; r++) { - auto begin = ranges[r].start, end = ranges[r].limit; + for (const auto& range : ukey_ranges) { + auto begin = range.start.has_value() ? &range.start.value() : nullptr; + auto end = range.limit.has_value() ? &range.limit.value() : nullptr; for (int i = 1; i < cfd->NumberLevels(); i++) { if (vstorage->LevelFiles(i).empty() || !vstorage->OverlapInLevel(i, begin, end)) { @@ -5604,8 +5466,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, continue; } if (!include_end && end != nullptr && - cfd->user_comparator()->Compare(level_file->largest.user_key(), - *end) == 0) { + (ucmp->CompareWithoutTimestamp(level_file->largest.user_key(), + *end) == 0)) { continue; } edit.SetColumnFamily(cfd->GetID()); @@ -5625,12 +5487,12 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, - &job_context.superversion_contexts[0], - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork( + cfd, job_context.superversion_contexts.data(), + *cfd->GetLatestMutableCFOptions()); } for (auto* deleted_file : deleted_files) { deleted_file->being_compacted = false; @@ -5865,7 +5727,7 @@ Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { return Status::OK(); } -DB::~DB() {} +DB::~DB() = default; Status DBImpl::Close() { InstrumentedMutexLock closing_lock_guard(&closing_mutex_); @@ -5892,7 +5754,7 @@ Status DB::ListColumnFamilies(const DBOptions& db_options, return VersionSet::ListColumnFamilies(column_families, name, fs.get()); } -Snapshot::~Snapshot() {} +Snapshot::~Snapshot() = default; Status DestroyDB(const std::string& dbname, const Options& options, const std::vector& column_families) { @@ -6039,7 +5901,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, return result; } -Status DBImpl::WriteOptionsFile(bool db_mutex_already_held) { +Status DBImpl::WriteOptionsFile(const WriteOptions& write_options, + bool db_mutex_already_held) { options_mutex_.AssertHeld(); if (!immutable_db_options_.use_options_file) { return Status::OK(); @@ -6076,8 +5939,8 @@ Status DBImpl::WriteOptionsFile(bool db_mutex_already_held) { std::string file_name = TempOptionsFileName(GetName(), versions_->NewFileNumber()); - Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name, - fs_.get()); + Status s = PersistRocksDBOptions(write_options, db_options, cf_names, cf_opts, + file_name, fs_.get()); if (s.ok()) { s = RenameTempFileToOptionsFile(file_name); @@ -6190,16 +6053,23 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { } } } + if (s.ok()) { - InstrumentedMutexLock l(&mutex_); - versions_->options_file_number_ = options_file_number; - versions_->options_file_size_ = options_file_size; - } + int my_disable_delete_obsolete_files; + + { + InstrumentedMutexLock l(&mutex_); + versions_->options_file_number_ = options_file_number; + versions_->options_file_size_ = options_file_size; + my_disable_delete_obsolete_files = disable_delete_obsolete_files_; + } - if (0 == disable_delete_obsolete_files_) { - // TODO: Should we check for errors here? - DeleteObsoleteOptionsFiles().PermitUncheckedError(); + if (!my_disable_delete_obsolete_files) { + // TODO: Should we check for errors here? + DeleteObsoleteOptionsFiles().PermitUncheckedError(); + } } + return s; } @@ -6270,7 +6140,7 @@ Status DBImpl::GetLatestSequenceForKey( MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); @@ -6426,8 +6296,10 @@ Status DBImpl::IngestExternalFile( Status DBImpl::IngestExternalFiles( const std::vector& args) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + if (args.empty()) { return Status::InvalidArgument("ingestion arg list is empty"); } @@ -6454,10 +6326,18 @@ Status DBImpl::IngestExternalFiles( } for (const auto& arg : args) { const IngestExternalFileOptions& ingest_opts = arg.options; - if (ingest_opts.ingest_behind && - !immutable_db_options_.allow_ingest_behind) { - return Status::InvalidArgument( - "can't ingest_behind file in DB with allow_ingest_behind=false"); + if (ingest_opts.ingest_behind) { + if (!immutable_db_options_.allow_ingest_behind) { + return Status::InvalidArgument( + "can't ingest_behind file in DB with allow_ingest_behind=false"); + } + auto ucmp = arg.column_family->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + return Status::NotSupported( + "Column family with user-defined " + "timestamps enabled doesn't support ingest behind."); + } } } @@ -6645,9 +6525,10 @@ Status DBImpl::IngestExternalFiles( } assert(0 == num_entries); } - status = versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, - read_options, edit_lists, &mutex_, - directories_.GetDbDir()); + status = versions_->LogAndApply( + cfds_to_commit, mutable_cf_options_list, read_options, write_options, + + edit_lists, &mutex_, directories_.GetDbDir()); // It is safe to update VersionSet last seqno here after LogAndApply since // LogAndApply persists last sequence number from VersionEdits, // which are from file's largest seqno and not from VersionSet. @@ -6749,8 +6630,10 @@ Status DBImpl::CreateColumnFamilyWithImport( ColumnFamilyHandle** handle) { assert(handle != nullptr); assert(*handle == nullptr); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + std::string cf_comparator_name = options.comparator->Name(); size_t total_file_num = 0; @@ -6766,7 +6649,8 @@ Status DBImpl::CreateColumnFamilyWithImport( } // Create column family. - auto status = CreateColumnFamily(options, column_family_name, handle); + auto status = CreateColumnFamily(read_options, write_options, options, + column_family_name, handle); if (!status.ok()) { return status; } @@ -6802,8 +6686,8 @@ Status DBImpl::CreateColumnFamilyWithImport( next_file_number = versions_->FetchAddFileNumber(total_file_num); auto cf_options = cfd->GetLatestMutableCFOptions(); status = - versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, - &mutex_, directories_.GetDbDir()); + versions_->LogAndApply(cfd, *cf_options, read_options, write_options, + &dummy_edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -6840,8 +6724,8 @@ Status DBImpl::CreateColumnFamilyWithImport( if (status.ok()) { auto cf_options = cfd->GetLatestMutableCFOptions(); status = versions_->LogAndApply(cfd, *cf_options, read_options, - import_job.edit(), &mutex_, - directories_.GetDbDir()); + write_options, import_job.edit(), + &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); } @@ -6905,8 +6789,8 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, if (status.ok()) { // DeleteFilesInRanges non-overlap files except L0 std::vector ranges; - ranges.push_back(RangePtr(nullptr, &begin_key)); - ranges.push_back(RangePtr(&end_key, nullptr)); + ranges.emplace_back(nullptr, &begin_key); + ranges.emplace_back(&end_key, nullptr); status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); } @@ -6925,6 +6809,7 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, empty_after_delete = true; } else { const Comparator* const ucmp = column_family->GetComparator(); + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions wo; // Delete [smallest_user_key, clip_begin_key) if (ucmp->Compare(smallest_user_key, begin_key) < 0) { @@ -6954,7 +6839,7 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, // last level to compact to and that range tombstones are not dropped // during non-bottommost compactions, calling CompactRange() on these two // ranges may not clear all range tombstones. - status = CompactRange(compact_options, nullptr, nullptr); + status = CompactRange(compact_options, column_family, nullptr, nullptr); } return status; } @@ -7153,7 +7038,7 @@ void DBImpl::NotifyOnExternalFileIngested( info.internal_file_path = f.internal_file_path; info.global_seqno = f.assigned_seqno; info.table_properties = f.table_properties; - for (auto listener : immutable_db_options_.listeners) { + for (const auto& listener : immutable_db_options_.listeners) { listener->OnExternalFileIngested(this, info); } } @@ -7245,8 +7130,10 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); assert(nullptr != next_file_number); @@ -7264,8 +7151,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( // reuse the file number that has already assigned to the internal file, // and this will overwrite the external file. To protect the external // file, we have to make sure the file number will never being reused. - s = versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cf_options, read_options, write_options, + &dummy_edit, &mutex_, directories_.GetDbDir()); if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -7311,28 +7198,27 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) { immutable_db_options_.clock->GetCurrentTime(&unix_time_signed) .PermitUncheckedError(); // Ignore error uint64_t unix_time = static_cast(unix_time_signed); - bool appended = false; - { - InstrumentedMutexLock l(&mutex_); - if (populate_historical_seconds > 0) { + + std::vector sv_contexts; + if (populate_historical_seconds > 0) { + bool success = true; + { + InstrumentedMutexLock l(&mutex_); if (seqno > 1 && unix_time > populate_historical_seconds) { // seqno=0 is reserved SequenceNumber from_seqno = 1; - appended = seqno_to_time_mapping_.PrePopulate( + success = seqno_to_time_mapping_.PrePopulate( from_seqno, seqno, unix_time - populate_historical_seconds, unix_time); + InstallSeqnoToTimeMappingInSV(&sv_contexts); } else { // One of these will fail assert(seqno > 1); assert(unix_time > populate_historical_seconds); + success = false; } - } else { - // FIXME: assert(seqno > 0); - appended = seqno_to_time_mapping_.Append(seqno, unix_time); } - } - if (populate_historical_seconds > 0) { - if (appended) { + if (success) { ROCKS_LOG_INFO( immutable_db_options_.info_log, "Pre-populated sequence number to time entries: [1,%" PRIu64 @@ -7345,11 +7231,17 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) { "] -> [%" PRIu64 ",%" PRIu64 "]", seqno, unix_time - populate_historical_seconds, unix_time); } - } else if (!appended) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to insert sequence number to time entry: %" PRIu64 - " -> %" PRIu64, - seqno, unix_time); + } else { + InstrumentedMutexLock l(&mutex_); + // FIXME: assert(seqno > 0); + // Always successful assuming seqno never go backwards + seqno_to_time_mapping_.Append(seqno, unix_time); + InstallSeqnoToTimeMappingInSV(&sv_contexts); + } + + // clean up outside db mutex + for (SuperVersionContext& sv_context : sv_contexts) { + sv_context.Clean(); } } @@ -7377,4 +7269,22 @@ thread_local bool threadLogging{false}; void SetThreadLogging(bool v) { threadLogging = v; } bool GetThreadLogging() { return threadLogging; } +void DBImpl::InstallSeqnoToTimeMappingInSV( + std::vector* sv_contexts) { + mutex_.AssertHeld(); + std::shared_ptr new_seqno_to_time_mapping = + std::make_shared(); + new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_); + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + sv_contexts->emplace_back(/*create_superversion=*/true); + sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping; + cfd->InstallSuperVersion(&sv_contexts->back(), + *(cfd->GetLatestMutableCFOptions())); + } + bg_cv_.SignalAll(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index bf61c847336..d49f8c0e20e 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -60,7 +60,6 @@ #include "rocksdb/utilities/replayer.h" #include "rocksdb/write_buffer_manager.h" #include "table/merging_iterator.h" -#include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/hash.h" #include "util/repeatable_thread.h" @@ -231,16 +230,12 @@ class DBImpl : public DB { const Slice& end_key, const Slice& ts) override; using DB::Write; - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override; + Status Write(const WriteOptions& options, WriteBatch* updates) override; using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; - virtual Status Get(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value, std::string* timestamp) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) override; using DB::GetEntity; Status GetEntity(const ReadOptions& options, @@ -265,17 +260,6 @@ class DBImpl : public DB { } using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, - std::vector* values) override; - virtual std::vector MultiGet( - const ReadOptions& _read_options, - const std::vector& column_family, - const std::vector& keys, std::vector* values, - std::vector* timestamps) override; - // This MultiGet is a batched version, which may be faster than calling Get // multiple times, especially if the keys have some spatial locality that // enables them to be queried in the same SST files/set of files. The larger @@ -283,19 +267,6 @@ class DBImpl : public DB { // The values and statuses parameters are arrays with number of elements // equal to keys.size(). This allows the storage for those to be alloacted // by the caller on the stack for small batches - void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, PinnableSlice* values, - Status* statuses, const bool sorted_input = false) override; - void MultiGet(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const size_t num_keys, - const Slice* keys, PinnableSlice* values, - std::string* timestamps, Status* statuses, - const bool sorted_input = false) override; - - void MultiGet(const ReadOptions& options, const size_t num_keys, - ColumnFamilyHandle** column_families, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input = false) override; void MultiGet(const ReadOptions& _read_options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, std::string* timestamps, @@ -321,18 +292,45 @@ class DBImpl : public DB { const Slice* keys, PinnableAttributeGroups* results) override; - virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, + Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const std::string& column_family, + ColumnFamilyHandle** handle) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return CreateColumnFamily(ReadOptions(), WriteOptions(), cf_options, + column_family, handle); + } + virtual Status CreateColumnFamily(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& column_family, - ColumnFamilyHandle** handle) override; + ColumnFamilyHandle** handle); + Status CreateColumnFamilies( + const ColumnFamilyOptions& cf_options, + const std::vector& column_family_names, + std::vector* handles) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return CreateColumnFamilies(ReadOptions(), WriteOptions(), cf_options, + column_family_names, handles); + } virtual Status CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const ColumnFamilyOptions& cf_options, const std::vector& column_family_names, - std::vector* handles) override; + std::vector* handles); + + Status CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return CreateColumnFamilies(ReadOptions(), WriteOptions(), column_families, + handles); + } virtual Status CreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& column_families, - std::vector* handles) override; - virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; - virtual Status DropColumnFamilies( + std::vector* handles); + Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + Status DropColumnFamilies( const std::vector& column_families) override; // Returns false if key doesn't exist in the database and true if it may. @@ -340,28 +338,35 @@ class DBImpl : public DB { // memory. On return, if value was found, then value_found will be set to true // , otherwise false. using DB::KeyMayExist; - virtual bool KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, std::string* timestamp, - bool* value_found = nullptr) override; + bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) override; using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family) override; - virtual Status NewIterators( - const ReadOptions& _read_options, - const std::vector& column_families, - std::vector* iterators) override; + Iterator* NewIterator(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family) override; + Status NewIterators(const ReadOptions& _read_options, + const std::vector& column_families, + std::vector* iterators) override; + + // UNDER CONSTRUCTION - DO NOT USE + // Return a cross-column-family iterator from a consistent database state. + std::unique_ptr NewMultiCfIterator( + const ReadOptions& options, + const std::vector& column_families) override; SequenceNumber GetIteratorSequenceNumber(Iterator* it) override; - virtual const Snapshot* GetSnapshot() override; // RocksDB-Cloud contribution begin Status GetSuperSnapshots( const std::vector& column_families, std::vector* snapshots) override; // RocksDB-Cloud contribution end + + virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; + // Create a timestamped snapshot. This snapshot can be shared by multiple // readers. If any of them uses it for write conflict checking, then // is_write_conflict_boundary is true. For simplicity, set it to true by @@ -376,35 +381,33 @@ class DBImpl : public DB { timestamped_snapshots) const; using DB::GetProperty; - virtual bool GetProperty(ColumnFamilyHandle* column_family, - const Slice& property, std::string* value) override; + bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, + std::string* value) override; using DB::GetMapProperty; - virtual bool GetMapProperty( - ColumnFamilyHandle* column_family, const Slice& property, - std::map* value) override; + bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, + std::map* value) override; using DB::GetIntProperty; - virtual bool GetIntProperty(ColumnFamilyHandle* column_family, - const Slice& property, uint64_t* value) override; + bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, + uint64_t* value) override; using DB::GetAggregatedIntProperty; - virtual bool GetAggregatedIntProperty(const Slice& property, - uint64_t* aggregated_value) override; + bool GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) override; using DB::GetApproximateSizes; - virtual Status GetApproximateSizes(const SizeApproximationOptions& options, - ColumnFamilyHandle* column_family, - const Range* range, int n, - uint64_t* sizes) override; + Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) override; using DB::GetApproximateMemTableStats; - virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, - const Range& range, - uint64_t* const count, - uint64_t* const size) override; + void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, uint64_t* const count, + uint64_t* const size) override; using DB::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) override; + Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, const Slice* begin, + const Slice* end) override; using DB::CompactFiles; - virtual Status CompactFiles( + Status CompactFiles( const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, const int output_level, @@ -412,14 +415,14 @@ class DBImpl : public DB { std::vector* const output_file_names = nullptr, CompactionJobInfo* compaction_job_info = nullptr) override; - virtual Status PauseBackgroundWork() override; - virtual Status ContinueBackgroundWork() override; + Status PauseBackgroundWork() override; + Status ContinueBackgroundWork() override; - virtual Status EnableAutoCompaction( + Status EnableAutoCompaction( const std::vector& column_family_handles) override; - virtual void EnableManualCompaction() override; - virtual void DisableManualCompaction() override; + void EnableManualCompaction() override; + void DisableManualCompaction() override; Status ApplyReplicationLogRecord(ReplicationLogRecord record, std::string replication_sequence, @@ -442,36 +445,40 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const std::unordered_map& options_map) override; - virtual Status SetDBOptions( + Status SetDBOptions( const std::unordered_map& options_map) override; using DB::NumberLevels; - virtual int NumberLevels(ColumnFamilyHandle* column_family) override; + int NumberLevels(ColumnFamilyHandle* column_family) override; using DB::MaxMemCompactionLevel; - virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override; + int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override; using DB::Level0StopWriteTrigger; - virtual int Level0StopWriteTrigger( - ColumnFamilyHandle* column_family) override; - virtual const std::string& GetName() const override; - virtual Env* GetEnv() const override; - virtual FileSystem* GetFileSystem() const override; + int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override; + const std::string& GetName() const override; + Env* GetEnv() const override; + FileSystem* GetFileSystem() const override; using DB::GetOptions; - virtual Options GetOptions(ColumnFamilyHandle* column_family) const override; + Options GetOptions(ColumnFamilyHandle* column_family) const override; using DB::GetDBOptions; - virtual DBOptions GetDBOptions() const override; + DBOptions GetDBOptions() const override; using DB::Flush; - virtual Status Flush(const FlushOptions& options, - ColumnFamilyHandle* column_family) override; - virtual Status Flush( + Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) override; + Status Flush( const FlushOptions& options, const std::vector& column_families) override; - virtual Status FlushWAL(bool sync) override; + Status FlushWAL(bool sync) override { + // TODO: plumb Env::IOActivity, Env::IOPriority + return FlushWAL(WriteOptions(), sync); + } + + virtual Status FlushWAL(const WriteOptions& write_options, bool sync); bool WALBufferIsEmpty(); - virtual Status SyncWAL() override; - virtual Status LockWAL() override; - virtual Status UnlockWAL() override; + Status SyncWAL() override; + Status LockWAL() override; + Status UnlockWAL() override; - virtual SequenceNumber GetLatestSequenceNumber() const override; + SequenceNumber GetLatestSequenceNumber() const override; // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire // and release db_mutex @@ -483,21 +490,21 @@ class DBImpl : public DB { Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, std::string* ts_low) override; - virtual Status GetDbIdentity(std::string& identity) const override; + Status GetDbIdentity(std::string& identity) const override; virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; - virtual Status GetDbSessionId(std::string& session_id) const override; + Status GetDbSessionId(std::string& session_id) const override; ColumnFamilyHandle* DefaultColumnFamily() const override; ColumnFamilyHandle* PersistentStatsColumnFamily() const; - virtual Status Close() override; + Status Close() override; - virtual Status DisableFileDeletions() override; + Status DisableFileDeletions() override; - virtual Status EnableFileDeletions(bool force) override; + Status EnableFileDeletions() override; virtual bool IsFileDeletionsEnabled() const; @@ -506,40 +513,35 @@ class DBImpl : public DB { std::unique_ptr* stats_iterator) override; using DB::ResetStats; - virtual Status ResetStats() override; + Status ResetStats() override; // All the returned filenames start with "/" - virtual Status GetLiveFiles(std::vector&, - uint64_t* manifest_file_size, - bool flush_memtable = true) override; - virtual Status GetSortedWalFiles(VectorLogPtr& files) override; - virtual Status GetCurrentWalFile( - std::unique_ptr* current_log_file) override; - virtual Status GetCreationTimeOfOldestFile( - uint64_t* creation_time) override; - - virtual Status GetUpdatesSince( + Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, + bool flush_memtable = true) override; + Status GetSortedWalFiles(VectorLogPtr& files) override; + Status GetCurrentWalFile(std::unique_ptr* current_log_file) override; + Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override; + + Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options = TransactionLogIterator::ReadOptions()) override; - virtual Status DeleteFile(std::string name) override; + Status DeleteFile(std::string name) override; Status DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end = true); - virtual void GetLiveFilesMetaData( - std::vector* metadata) override; + void GetLiveFilesMetaData(std::vector* metadata) override; - virtual Status GetLiveFilesChecksumInfo( - FileChecksumList* checksum_list) override; + Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) override; - virtual Status GetLiveFilesStorageInfo( + Status GetLiveFilesStorageInfo( const LiveFilesStorageInfoOptions& opts, std::vector* files) override; // Obtains the meta data of the specified column family of the DB. // TODO(yhchiang): output parameter is placed in the end in this codebase. - virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* metadata) override; + void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override; void GetAllColumnFamilyMetaData( std::vector* metadata) override; @@ -551,32 +553,32 @@ class DBImpl : public DB { int target_level) override; using DB::IngestExternalFile; - virtual Status IngestExternalFile( + Status IngestExternalFile( ColumnFamilyHandle* column_family, const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) override; using DB::IngestExternalFiles; - virtual Status IngestExternalFiles( + Status IngestExternalFiles( const std::vector& args) override; using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const std::vector& metadatas, ColumnFamilyHandle** handle) override; using DB::ClipColumnFamily; - virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, - const Slice& begin_key, - const Slice& end_key) override; + Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override; using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_options) override; using DB::VerifyChecksum; - virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + Status VerifyChecksum(const ReadOptions& /*read_options*/) override; // Verify the checksums of files in db. Currently only tables are checked. // // read_options: controls file I/O behavior, e.g. read ahead size while @@ -597,18 +599,16 @@ class DBImpl : public DB { const ReadOptions& read_options); using DB::StartTrace; - virtual Status StartTrace( - const TraceOptions& options, - std::unique_ptr&& trace_writer) override; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override; using DB::EndTrace; - virtual Status EndTrace() override; + Status EndTrace() override; using DB::NewDefaultReplayer; - virtual Status NewDefaultReplayer( - const std::vector& handles, - std::unique_ptr&& reader, - std::unique_ptr* replayer) override; + Status NewDefaultReplayer(const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override; using DB::StartBlockCacheTrace; Status StartBlockCacheTrace( @@ -630,14 +630,12 @@ class DBImpl : public DB { Status EndIOTrace() override; using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override; - virtual Status GetPropertiesOfTablesInRange( + Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; + Status GetPropertiesOfTablesInRange( ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override; - // ---- End of implementations of the DB interface ---- SystemClock* GetSystemClock() const; @@ -680,8 +678,8 @@ class DBImpl : public DB { // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file. ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, - ColumnFamilyData* cfd, SuperVersion* sv, - SequenceNumber snapshot, + ColumnFamilyHandleImpl* cfh, + SuperVersion* sv, SequenceNumber snapshot, ReadCallback* read_callback, bool expose_blob_index = false, bool allow_refresh = true); @@ -1184,6 +1182,8 @@ class DBImpl : public DB { void TEST_UnlockMutex(); + InstrumentedMutex* TEST_Mutex() { return &mutex_; } + void TEST_SignalAllBgCv(); // REQUIRES: mutex locked @@ -1261,6 +1261,22 @@ class DBImpl : public DB { // populate_historical_seconds, now]. void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds); + // Everytime DB's seqno to time mapping changed (which already hold the db + // mutex), we install a new SuperVersion in each column family with a shared + // copy of the new mapping while holding the db mutex. + // This is done for all column families even though the column family does not + // explicitly enabled the + // `preclude_last_level_data_seconds` or `preserve_internal_time_seconds` + // features. + // This mapping supports iterators to fulfill the + // "rocksdb.iterator.write-time" iterator property for entries in memtables. + // + // Since this new SuperVersion doesn't involve an LSM tree shape change, we + // don't schedule work after installing this SuperVersion. It returns the used + // `SuperVersionContext` for clean up after release mutex. + void InstallSeqnoToTimeMappingInSV( + std::vector* sv_contexts); + // Interface to block and signal the DB in case of stalling writes by // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. // When DB needs to be blocked or signalled by WriteBufferManager, @@ -1441,7 +1457,8 @@ class DBImpl : public DB { // Persist options to options file. Must be holding options_mutex_. // Will lock DB mutex if !db_mutex_already_held. - Status WriteOptionsFile(bool db_mutex_already_held); + Status WriteOptionsFile(const WriteOptions& write_options, + bool db_mutex_already_held); Status CompactRangeInternal(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, @@ -1567,7 +1584,8 @@ class DBImpl : public DB { virtual bool OwnTablesAndLogs() const { return true; } // Setup DB identity file, and write DB ID to manifest if necessary. - Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx); + Status SetupDBId(const WriteOptions& write_options, bool read_only, + RecoveryContext* recovery_ctx); // Assign db_id_ and write DB ID to manifest if necessary. void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx); @@ -1694,7 +1712,8 @@ class DBImpl : public DB { return w; } Status ClearWriter() { - Status s = writer->WriteBuffer(); + // TODO: plumb Env::IOActivity, Env::IOPriority + Status s = writer->WriteBuffer(WriteOptions()); delete writer; writer = nullptr; return s; @@ -1870,12 +1889,15 @@ class DBImpl : public DB { const Status CreateArchivalDirectory(); // Create a column family, without some of the follow-up work yet - Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, + Status CreateColumnFamilyImpl(const ReadOptions& read_options, + const WriteOptions& write_options, + const ColumnFamilyOptions& cf_options, const std::string& cf_name, ColumnFamilyHandle** handle); // Follow-up work to user creating a column family or (families) Status WrapUpCreateColumnFamilies( + const ReadOptions& read_options, const WriteOptions& write_options, const std::vector& cf_options); Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family); @@ -1907,7 +1929,8 @@ class DBImpl : public DB { void ReleaseFileNumberFromPendingOutputs( std::unique_ptr::iterator>& v); - IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals, + IOStatus SyncClosedLogs(const WriteOptions& write_options, + JobContext* job_context, VersionEdit* synced_wals, bool error_recovery_in_prog); // Flush the in-memory write buffer to storage. Switches to a new @@ -2109,12 +2132,10 @@ class DBImpl : public DB { WriteBatch* tmp_batch, WriteBatch** merged_batch, size_t* write_with_wal, WriteBatch** to_be_cached_state); - // rate_limiter_priority is used to charge `DBOptions::rate_limiter` - // for automatic WAL flush (`Options::manual_wal_flush` == false) - // associated with this WriteToWAL - IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, - uint64_t* log_used, uint64_t* log_size, - Env::IOPriority rate_limiter_priority, + IOStatus WriteToWAL(const WriteBatch& merged_batch, + const WriteOptions& write_options, + log::Writer* log_writer, uint64_t* log_used, + uint64_t* log_size, LogFileNumberSize& log_file_number_size); IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group, @@ -2226,7 +2247,9 @@ class DBImpl : public DB { // Cancel scheduled periodic tasks Status CancelPeriodicTaskScheduler(); - Status RegisterRecordSeqnoTimeWorker(bool is_new_db); + Status RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, + const WriteOptions& write_options, + bool is_new_db); void PrintStatistics(); @@ -2254,7 +2277,9 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); - Status ApplyWALToManifest(const ReadOptions& read_options, VersionEdit* edit); + Status ApplyWALToManifest(const ReadOptions& read_options, + const WriteOptions& write_options, + VersionEdit* edit); // WALs with log number up to up_to are not synced successfully. void MarkLogsNotSynced(uint64_t up_to); @@ -2326,8 +2351,9 @@ class DBImpl : public DB { size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); + IOStatus CreateWAL(const WriteOptions& write_options, uint64_t log_file_num, + uint64_t recycle_log_number, size_t preallocate_block_size, + log::Writer** new_log); // Validate self-consistency of DB options static Status ValidateOptions(const DBOptions& db_options); @@ -2867,17 +2893,16 @@ class GetWithTimestampReadCallback : public ReadCallback { } }; -extern Options SanitizeOptions(const std::string& db, const Options& src, - bool read_only = false, - Status* logger_creation_s = nullptr); +Options SanitizeOptions(const std::string& db, const Options& src, + bool read_only = false, + Status* logger_creation_s = nullptr); -extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src, - bool read_only = false, - Status* logger_creation_s = nullptr); +DBOptions SanitizeOptions(const std::string& db, const DBOptions& src, + bool read_only = false, + Status* logger_creation_s = nullptr); -extern CompressionType GetCompressionFlush( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); +CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); // Return the earliest log file to keep after the memtable flush is // finalized. @@ -2885,13 +2910,13 @@ extern CompressionType GetCompressionFlush( // `memtables_to_flush`) will be flushed and thus will not depend on any WAL // file. // The function is only applicable to 2pc mode. -extern uint64_t PrecomputeMinLogNumberToKeep2PC( +uint64_t PrecomputeMinLogNumberToKeep2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, const autovector& edit_list, const autovector& memtables_to_flush, LogsWithPrepTracker* prep_tracker); // For atomic flush. -extern uint64_t PrecomputeMinLogNumberToKeep2PC( +uint64_t PrecomputeMinLogNumberToKeep2PC( VersionSet* vset, const autovector& cfds_to_flush, const autovector>& edit_lists, const autovector*>& memtables_to_flush, @@ -2899,21 +2924,21 @@ extern uint64_t PrecomputeMinLogNumberToKeep2PC( // In non-2PC mode, WALs with log number < the returned number can be // deleted after the cfd_to_flush column family is flushed successfully. -extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( +uint64_t PrecomputeMinLogNumberToKeepNon2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, const autovector& edit_list); // For atomic flush. -extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( +uint64_t PrecomputeMinLogNumberToKeepNon2PC( VersionSet* vset, const autovector& cfds_to_flush, const autovector>& edit_lists); // `cfd_to_flush` is the column family whose memtable will be flushed and thus // will not depend on any WAL file. nullptr means no memtable is being flushed. // The function is only applicable to 2pc mode. -extern uint64_t FindMinPrepLogReferencedByMemTable( +uint64_t FindMinPrepLogReferencedByMemTable( VersionSet* vset, const autovector& memtables_to_flush); // For atomic flush. -extern uint64_t FindMinPrepLogReferencedByMemTable( +uint64_t FindMinPrepLogReferencedByMemTable( VersionSet* vset, const autovector*>& memtables_to_flush); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 36a67c82864..1db619f86f6 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -20,6 +20,10 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" @@ -113,7 +117,8 @@ bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT( return true; } -IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, +IOStatus DBImpl::SyncClosedLogs(const WriteOptions& write_options, + JobContext* job_context, VersionEdit* synced_wals, bool error_recovery_in_prog) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); @@ -144,7 +149,13 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, if (error_recovery_in_prog) { log->file()->reset_seen_error(); } - io_s = log->file()->Sync(immutable_db_options_.use_fsync); + + IOOptions io_options; + io_s = WritableFileWriter::PrepareIOOptions(write_options, io_options); + if (!io_s.ok()) { + break; + } + io_s = log->file()->Sync(io_options, immutable_db_options_.use_fsync); if (!io_s.ok()) { break; } @@ -153,16 +164,23 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, if (error_recovery_in_prog) { log->file()->reset_seen_error(); } - io_s = log->Close(); + // Normally the log file is closed when purging obsolete file, but if + // log recycling is enabled, the log file is closed here so that it + // can be reused. + io_s = log->Close(write_options); if (!io_s.ok()) { break; } } } if (io_s.ok()) { - io_s = directories_.GetWalDir()->FsyncWithDirOptions( - IOOptions(), nullptr, - DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + IOOptions io_options; + io_s = WritableFileWriter::PrepareIOOptions(write_options, io_options); + if (io_s.ok()) { + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + io_options, nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } } TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock", @@ -200,6 +218,8 @@ Status DBImpl::FlushMemTableToOutputFile( assert(cfd->imm()->IsFlushPending()); assert(versions_); assert(versions_->GetColumnFamilySet()); + const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); // If there are more than one column families, we need to make sure that // all the log files except the most recent one are synced. Otherwise if // the host crashes after flushing and before WAL is persistent, the @@ -253,8 +273,8 @@ Status DBImpl::FlushMemTableToOutputFile( GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, true /* sync_output_directory */, true /* write_manifest */, thread_pri, - io_tracer_, seqno_to_time_mapping_, db_id_, db_session_id_, - cfd->GetFullHistoryTsLow(), &blob_callback_); + io_tracer_, cfd->GetSuperVersion()->ShareSeqnoToTimeMapping(), db_id_, + db_session_id_, cfd->GetFullHistoryTsLow(), &blob_callback_); FileMetaData file_meta; Status s; @@ -266,13 +286,12 @@ Status DBImpl::FlushMemTableToOutputFile( VersionEdit synced_wals; bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); mutex_.Unlock(); - log_io_s = - SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog); + log_io_s = SyncClosedLogs(write_options, job_context, &synced_wals, + error_recovery_in_prog); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - const ReadOptions read_options(Env::IOActivity::kFlush); - log_io_s = - status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); + log_io_s = status_to_io_status( + ApplyWALToManifest(read_options, write_options, &synced_wals)); TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1", nullptr); } @@ -466,6 +485,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( const autovector& bg_flush_args, bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) { mutex_.AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); autovector cfds; for (const auto& arg : bg_flush_args) { @@ -527,8 +548,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri, io_tracer_, seqno_to_time_mapping_, db_id_, db_session_id_, - cfd->GetFullHistoryTsLow(), &blob_callback_)); + thread_pri, io_tracer_, + cfd->GetSuperVersion()->ShareSeqnoToTimeMapping(), db_id_, + db_session_id_, cfd->GetFullHistoryTsLow(), &blob_callback_)); } std::vector file_meta(num_cfs); @@ -553,13 +575,12 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( VersionEdit synced_wals; bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); mutex_.Unlock(); - log_io_s = - SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog); + log_io_s = SyncClosedLogs(write_options, job_context, &synced_wals, + error_recovery_in_prog); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - const ReadOptions read_options(Env::IOActivity::kFlush); - log_io_s = - status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); + log_io_s = status_to_io_status( + ApplyWALToManifest(read_options, write_options, &synced_wals)); } if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && @@ -654,9 +675,14 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Sync on all distinct output directories. for (auto dir : distinct_output_dirs) { if (dir != nullptr) { - Status error_status = dir->FsyncWithDirOptions( - IOOptions(), nullptr, - DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + IOOptions io_options; + Status error_status = + WritableFileWriter::PrepareIOOptions(write_options, io_options); + if (error_status.ok()) { + error_status = dir->FsyncWithDirOptions( + io_options, nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } if (!error_status.ok()) { s = error_status; break; @@ -948,7 +974,7 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, info.smallest_seqno = file_meta->fd.smallest_seqno; info.largest_seqno = file_meta->fd.largest_seqno; info.flush_reason = flush_reason; - for (auto listener : immutable_db_options_.listeners) { + for (const auto& listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); } } @@ -980,7 +1006,7 @@ void DBImpl::NotifyOnFlushCompleted( for (auto& info : *flush_jobs_info) { info->triggered_writes_slowdown = triggered_writes_slowdown; info->triggered_writes_stop = triggered_writes_stop; - for (auto listener : immutable_db_options_.listeners) { + for (const auto& listener : immutable_db_options_.listeners) { listener->OnFlushCompleted(this, *info); } TEST_SYNC_POINT( @@ -1050,8 +1076,10 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, edit.SetColumnFamily(cfd->GetID()); edit.SetFullHistoryTsLow(ts_low); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit", &edit); @@ -1065,7 +1093,7 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, } Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { return s; @@ -1116,7 +1144,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, // TODO(ajkr): We could also optimize away the flush in certain cases where // one/both sides of the interval are unbounded. But it requires more // changes to RangesOverlapWithMemtables. - Range range(*begin, *end); + UserKeyRange range(*begin, *end); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); s = cfd->RangesOverlapWithMemtables( {range}, super_version, immutable_db_options_.allow_data_in_errors, @@ -1395,10 +1423,9 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, // Perform CompactFiles TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2"); - TEST_SYNC_POINT_CALLBACK( - "TestCompactFiles:PausingManualCompaction:3", - reinterpret_cast( - const_cast*>(&manual_compaction_paused_))); + TEST_SYNC_POINT_CALLBACK("TestCompactFiles:PausingManualCompaction:3", + static_cast(const_cast*>( + &manual_compaction_paused_))); { InstrumentedMutexLock l(&mutex_); auto* current = cfd->current(); @@ -1586,9 +1613,9 @@ Status DBImpl::CompactFilesImpl( } if (status.ok()) { assert(compaction_job.io_status().ok()); - InstallSuperVersionAndScheduleWork(c->column_family_data(), - &job_context->superversion_contexts[0], - *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork( + c->column_family_data(), job_context->superversion_contexts.data(), + *c->mutable_cf_options()); } // status above captures any error during compaction_job.Install, so its ok // not check compaction_job.io_status() explicitly if we're not calling @@ -1671,7 +1698,7 @@ Status DBImpl::PauseBackgroundWork() { Status DBImpl::ContinueBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); if (bg_work_paused_ == 0) { - return Status::InvalidArgument(); + return Status::InvalidArgument("Background work already unpaused"); } assert(bg_work_paused_ > 0); assert(bg_compaction_paused_ > 0); @@ -1708,7 +1735,7 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, { CompactionJobInfo info{}; BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info); - for (auto listener : immutable_db_options_.listeners) { + for (const auto& listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); } info.status.PermitUncheckedError(); @@ -1737,7 +1764,7 @@ void DBImpl::NotifyOnCompactionCompleted( { CompactionJobInfo info{}; BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info); - for (auto listener : immutable_db_options_.listeners) { + for (const auto& listener : immutable_db_options_.listeners) { listener->OnCompactionCompleted(this, info); } } @@ -1755,6 +1782,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { } const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); SuperVersionContext sv_context(/* create_superversion */ true); @@ -1844,7 +1872,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { , LLONG_MAX /* max compaction bytes, not applicable */, 0 /* output path ID, not applicable */, mutable_cf_options.compression, - mutable_cf_options.compression_opts, Temperature::kUnknown, + mutable_cf_options.compression_opts, + mutable_cf_options.default_write_temperature, 0 /* max_subcompactions, not applicable */, {} /* grandparents, not applicable */, false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */, @@ -1871,9 +1900,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - Status status = - versions_->LogAndApply(cfd, mutable_cf_options, read_options, &edit, - &mutex_, directories_.GetDbDir()); + Status status = versions_->LogAndApply(cfd, mutable_cf_options, + read_options, write_options, &edit, + &mutex_, directories_.GetDbDir()); cfd->compaction_picker()->UnregisterCompaction(c.get()); c.reset(); @@ -3051,8 +3080,8 @@ void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync, } void DBImpl::BGWorkFlush(void* arg) { - FlushThreadArg fta = *(reinterpret_cast(arg)); - delete reinterpret_cast(arg); + FlushThreadArg fta = *(static_cast(arg)); + delete static_cast(arg); IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush"); @@ -3061,8 +3090,8 @@ void DBImpl::BGWorkFlush(void* arg) { } void DBImpl::BGWorkCompaction(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); - delete reinterpret_cast(arg); + CompactionArg ca = *(static_cast(arg)); + delete static_cast(arg); IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); auto prepicked_compaction = @@ -3086,12 +3115,12 @@ void DBImpl::BGWorkBottomCompaction(void* arg) { void DBImpl::BGWorkPurge(void* db) { IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); TEST_SYNC_POINT("DBImpl::BGWorkPurge:start"); - reinterpret_cast(db)->BackgroundCallPurge(); + static_cast(db)->BackgroundCallPurge(); TEST_SYNC_POINT("DBImpl::BGWorkPurge:end"); } void DBImpl::UnscheduleCompactionCallback(void* arg) { - CompactionArg* ca_ptr = reinterpret_cast(arg); + CompactionArg* ca_ptr = static_cast(arg); Env::Priority compaction_pri = ca_ptr->compaction_pri_; if (Env::Priority::BOTTOM == compaction_pri) { // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM @@ -3101,7 +3130,7 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) { ca_ptr->db->bg_compaction_scheduled_--; } CompactionArg ca = *(ca_ptr); - delete reinterpret_cast(arg); + delete static_cast(arg); if (ca.prepicked_compaction != nullptr) { // if it's a manual compaction, set status to ManualCompactionPaused if (ca.prepicked_compaction->manual_compaction_state) { @@ -3121,14 +3150,14 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) { void DBImpl::UnscheduleFlushCallback(void* arg) { // Decrement bg_flush_scheduled_ in flush callback - reinterpret_cast(arg)->db_->bg_flush_scheduled_--; - Env::Priority flush_pri = reinterpret_cast(arg)->thread_pri_; + static_cast(arg)->db_->bg_flush_scheduled_--; + Env::Priority flush_pri = static_cast(arg)->thread_pri_; if (Env::Priority::LOW == flush_pri) { TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback"); } else if (Env::Priority::HIGH == flush_pri) { TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback"); } - delete reinterpret_cast(arg); + delete static_cast(arg); TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); } @@ -3227,7 +3256,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, column_families_not_to_flush.push_back(cfd); continue; } - superversion_contexts.emplace_back(SuperVersionContext(true)); + superversion_contexts.emplace_back(true); bg_flush_args.emplace_back(cfd, max_memtable_id, &(superversion_contexts.back()), flush_reason); } @@ -3333,7 +3362,7 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0"); ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - // There is no need to do these clean up if the flush job is rescheduled + // There is no need to find obsolete files if the flush job is rescheduled // to retain user-defined timestamps because the job doesn't get to the // stage of actually flushing the MemTables. if (!flush_rescheduled_to_retain_udt) { @@ -3341,25 +3370,25 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { // have created. Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()); - // delete unnecessary files if any, this is done outside the mutex - if (job_context.HaveSomethingToClean() || - job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { - mutex_.Unlock(); - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); - // Have to flush the info logs before bg_flush_scheduled_-- - // because if bg_flush_scheduled_ becomes 0 and the lock is - // released, the deconstructor of DB can kick in and destroy all the - // states of DB so info_log might not be available after that point. - // It also applies to access other states that DB owns. - log_buffer.FlushBufferToLog(); - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); - } - job_context.Clean(); - mutex_.Lock(); + } + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); + // Have to flush the info logs before bg_flush_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp"); + job_context.Clean(); + mutex_.Lock(); } + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp"); assert(num_running_flushes_ > 0); num_running_flushes_--; @@ -3512,6 +3541,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); bool is_manual = (manual_compaction != nullptr); std::unique_ptr c; @@ -3724,16 +3754,16 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } status = versions_->LogAndApply( c->column_family_data(), *c->mutable_cf_options(), read_options, - c->edit(), &mutex_, directories_.GetDbDir(), + write_options, c->edit(), &mutex_, directories_.GetDbDir(), /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, [&c, &compaction_released](const Status& s) { c->ReleaseCompactionFiles(s); compaction_released = true; }); io_s = versions_->io_status(); - InstallSuperVersionAndScheduleWork(c->column_family_data(), - &job_context->superversion_contexts[0], - *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork( + c->column_family_data(), job_context->superversion_contexts.data(), + *c->mutable_cf_options()); ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); @@ -3798,7 +3828,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } status = versions_->LogAndApply( c->column_family_data(), *c->mutable_cf_options(), read_options, - c->edit(), &mutex_, directories_.GetDbDir(), + write_options, c->edit(), &mutex_, directories_.GetDbDir(), /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, [&c, &compaction_released](const Status& s) { c->ReleaseCompactionFiles(s); @@ -3806,9 +3836,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, }); io_s = versions_->io_status(); // Use latest MutableCFOptions - InstallSuperVersionAndScheduleWork(c->column_family_data(), - &job_context->superversion_contexts[0], - *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork( + c->column_family_data(), job_context->superversion_contexts.data(), + *c->mutable_cf_options()); VersionStorageInfo::LevelSummaryStorage tmp; c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), @@ -3902,9 +3932,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job.Install(*c->mutable_cf_options(), &compaction_released); io_s = compaction_job.io_status(); if (status.ok()) { - InstallSuperVersionAndScheduleWork(c->column_family_data(), - &job_context->superversion_contexts[0], - *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork( + c->column_family_data(), job_context->superversion_contexts.data(), + *c->mutable_cf_options()); } *made_progress = true; TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", @@ -4051,7 +4081,6 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { ++it; } assert(false); - return; } bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 2588fd8eb82..aa1454b3bfa 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -216,11 +216,11 @@ void DBImpl::TEST_SignalAllBgCv() { bg_cv_.SignalAll(); } void* DBImpl::TEST_BeginWrite() { auto w = new WriteThread::Writer(); write_thread_.EnterUnbatched(w, &mutex_); - return reinterpret_cast(w); + return static_cast(w); } void DBImpl::TEST_EndWrite(void* w) { - auto writer = reinterpret_cast(w); + auto writer = static_cast(w); write_thread_.ExitUnbatched(writer); delete writer; } diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 442cb47679d..113a7f42ff4 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -61,8 +61,10 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { "PromoteL0 FAILED. Invalid target level %d\n", target_level); return Status::InvalidArgument("Invalid target level"); } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + Status status; VersionEdit edit; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -102,7 +104,9 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { return status; } - if (i == 0) continue; + if (i == 0) { + continue; + } auto prev_f = l0_files[i - 1]; if (icmp->Compare(prev_f->largest, f->smallest) >= 0) { ROCKS_LOG_INFO(immutable_db_options_.info_log, @@ -143,12 +147,12 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, - &job_context.superversion_contexts[0], - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork( + cfd, job_context.superversion_contexts.data(), + *cfd->GetLatestMutableCFOptions()); } } // lock released here LogFlush(immutable_db_options_.info_log); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 3887e9a7940..d235014d1ad 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -18,6 +18,7 @@ #include "file/sst_file_manager_impl.h" #include "logging/logging.h" #include "port/port.h" +#include "rocksdb/options.h" #include "util/autovector.h" #include "util/defer.h" @@ -51,7 +52,7 @@ Status DBImpl::DisableFileDeletions() { if (my_disable_delete_obsolete_files == 1) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled, but already disabled. Counter: %d", my_disable_delete_obsolete_files); } @@ -66,17 +67,14 @@ Status DBImpl::DisableFileDeletionsWithLock() { return Status::OK(); } -Status DBImpl::EnableFileDeletions(bool force) { +Status DBImpl::EnableFileDeletions() { // Job id == 0 means that this is not our background process, but rather // user thread JobContext job_context(0); int saved_counter; // initialize on all paths { InstrumentedMutexLock l(&mutex_); - if (force) { - // if force, we need to enable file deletions right away - disable_delete_obsolete_files_ = 0; - } else if (disable_delete_obsolete_files_ > 0) { + if (disable_delete_obsolete_files_ > 0) { --disable_delete_obsolete_files_; } saved_counter = disable_delete_obsolete_files_; @@ -91,7 +89,7 @@ Status DBImpl::EnableFileDeletions(bool force) { PurgeObsoleteFiles(job_context); } } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", saved_counter); } @@ -512,7 +510,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // Close WALs before trying to delete them. for (const auto w : state.logs_to_free) { // TODO: maybe check the return value of Close. - auto s = w->Close(); + // TODO: plumb Env::IOActivity, Env::IOPriority + auto s = w->Close(WriteOptions()); s.PermitUncheckedError(); } @@ -927,7 +926,8 @@ void DBImpl::SetDBId(std::string&& id, bool read_only, } } -Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) { +Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only, + RecoveryContext* recovery_ctx) { Status s; // Check for the IDENTITY file and create it if not there or // broken or not matching manifest @@ -960,7 +960,7 @@ Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) { } // Persist it to IDENTITY file if allowed if (!read_only) { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetIdentityFile(write_options, env_, dbname_, db_id_); } return s; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index fac365d2261..640c23d9e87 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -21,6 +21,7 @@ #include "monitoring/persistent_stats_history.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" +#include "rocksdb/options.h" #include "rocksdb/table.h" #include "rocksdb/wal_filter.h" #include "test_util/sync_point.h" @@ -103,7 +104,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, if (result.recycle_log_file_num && (result.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords || - result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { // - kTolerateCorruptedTailRecords is inconsistent with recycle log file // feature. WAL recycling expects recovery success upon encountering a @@ -280,11 +280,6 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { "atomic_flush is incompatible with enable_pipelined_write"); } - // TODO remove this restriction - if (db_options.atomic_flush && db_options.best_efforts_recovery) { - return Status::InvalidArgument( - "atomic_flush is currently incompatible with best-efforts recovery"); - } if (db_options.replication_log_listener && !db_options.atomic_flush) { return Status::InvalidArgument( @@ -314,7 +309,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { Status DBImpl::NewDB(std::vector* new_filenames) { VersionEdit new_db; - Status s = SetIdentityFile(env_, dbname_); + const WriteOptions write_options(Env::IOActivity::kDBOpen); + Status s = SetIdentityFile(write_options, env_, dbname_); if (!s.ok()) { return s; } @@ -344,20 +340,23 @@ Status DBImpl::NewDB(std::vector* new_filenames) { immutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( std::move(file), manifest, file_options, immutable_db_options_.clock, - io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, - nullptr, tmp_set.Contains(FileType::kDescriptorFile), + io_tracer_, nullptr /* stats */, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + immutable_db_options_.listeners, nullptr, + tmp_set.Contains(FileType::kDescriptorFile), tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(write_options, record); if (s.ok()) { - s = SyncManifest(&immutable_db_options_, log.file()); + s = SyncManifest(&immutable_db_options_, write_options, log.file()); } } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir()); + s = SetCurrentFile(write_options, fs_.get(), dbname_, 1, + directories_.GetDbDir()); if (new_filenames) { new_filenames->emplace_back( manifest.substr(manifest.find_last_of("/\\") + 1)); @@ -423,6 +422,7 @@ Status DBImpl::Recover( uint64_t* recovered_seq, RecoveryContext* recovery_ctx) { mutex_.AssertHeld(); + const WriteOptions write_options(Env::IOActivity::kDBOpen); bool tmp_is_new_db = false; bool& is_new_db = recovery_ctx ? recovery_ctx->is_new_db_ : tmp_is_new_db; assert(db_lock_ == nullptr); @@ -649,7 +649,7 @@ Status DBImpl::Recover( } } } - s = SetupDBId(read_only, recovery_ctx); + s = SetupDBId(write_options, read_only, recovery_ctx); ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str()); #if 0 // RocksDB-Cloud skips DeleteUnreferencedSstFiles() for a couple of reasons: @@ -889,8 +889,9 @@ Status DBImpl::PersistentStatsProcessFormatVersion() { if (s.ok()) { ColumnFamilyOptions cfo; OptimizeForPersistentStats(&cfo); - s = CreateColumnFamilyImpl(cfo, kPersistentStatsColumnFamilyName, - &handle); + s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen), + WriteOptions(Env::IOActivity::kDBOpen), cfo, + kPersistentStatsColumnFamilyName, &handle); } if (s.ok()) { persist_stats_cf_handle_ = static_cast(handle); @@ -912,6 +913,7 @@ Status DBImpl::PersistentStatsProcessFormatVersion() { std::to_string(kStatsCFCompatibleFormatVersion)); } if (s.ok()) { + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions wo; wo.low_pri = true; wo.no_slowdown = true; @@ -943,7 +945,9 @@ Status DBImpl::InitPersistStatsColumnFamily() { ColumnFamilyHandle* handle = nullptr; ColumnFamilyOptions cfo; OptimizeForPersistentStats(&cfo); - s = CreateColumnFamilyImpl(cfo, kPersistentStatsColumnFamilyName, &handle); + s = CreateColumnFamilyImpl(ReadOptions(Env::IOActivity::kDBOpen), + WriteOptions(Env::IOActivity::kDBOpen), cfo, + kPersistentStatsColumnFamilyName, &handle); persist_stats_cf_handle_ = static_cast(handle); mutex_.Lock(); } @@ -953,13 +957,18 @@ Status DBImpl::InitPersistStatsColumnFamily() { Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { mutex_.AssertHeld(); assert(versions_->descriptor_log_ == nullptr); + + Status s; if (!recovery_ctx.edit_lists_.empty()) { const ReadOptions read_options(Env::IOActivity::kDBOpen); - s = versions_->LogAndApply( - recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, read_options, - recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); + const WriteOptions write_options(Env::IOActivity::kDBOpen); + s = versions_->LogAndApply(recovery_ctx.cfds_, + recovery_ctx.mutable_cf_opts_, read_options, + write_options, recovery_ctx.edit_lists_, &mutex_, + directories_.GetDbDir()); } + if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) { mutex_.Unlock(); for (const auto& stale_sst_file : recovery_ctx.files_to_delete_) { @@ -1104,6 +1113,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, Logger* info_log; const char* fname; Status* status; // nullptr if immutable_db_options_.paranoid_checks==false + bool* old_log_record; void Corruption(size_t bytes, const Status& s) override { ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", (status == nullptr ? "(ignoring error) " : ""), fname, @@ -1112,10 +1122,19 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, *status = s; } } + + void OldLogRecord(size_t bytes) override { + if (old_log_record != nullptr) { + *old_log_record = true; + } + ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes; possibly recycled", + fname, static_cast(bytes)); + } }; mutex_.AssertHeld(); Status status; + bool old_log_record = false; std::unordered_map version_edits; // no need to refcount because iteration is under mutex for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -1206,6 +1225,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, reporter.env = env_; reporter.info_log = immutable_db_options_.info_log.get(); reporter.fname = fname.c_str(); + reporter.old_log_record = &old_log_record; if (!immutable_db_options_.paranoid_checks || immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kSkipAnyCorruptedRecords) { @@ -1353,7 +1373,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, } } - if (!status.ok()) { + if (!status.ok() || old_log_record) { if (status.IsNotSupported()) { // We should not treat NotSupported as corruption. It is rather a clear // sign that we are processing a WAL that is produced by an incompatible @@ -1378,6 +1398,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, } // We should ignore the error but not continue replaying status = Status::OK(); + old_log_record = false; stop_replay_for_corruption = true; corrupted_wal_number = wal_number; if (corrupted_wal_found != nullptr) { @@ -1650,7 +1671,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, Status s; TableProperties table_properties; { - ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + ScopedArenaPtr iter( + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" " Level-0 table #%" PRIu64 ": started", @@ -1692,28 +1714,28 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, } IOStatus io_s; + const ReadOptions read_option(Env::IOActivity::kDBOpen); + const WriteOptions write_option(Env::IO_HIGH, Env::IOActivity::kDBOpen); TableBuilderOptions tboptions( - *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), + *cfd->ioptions(), mutable_cf_options, read_option, write_option, + cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), 0 /* level */, false /* is_bottommost */, TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); - SeqnoToTimeMapping empty_seqno_to_time_mapping; Version* version = cfd->current(); version->Ref(); - const ReadOptions read_option(Env::IOActivity::kDBOpen); uint64_t num_input_entries = 0; s = BuildTable( dbname_, versions_.get(), immutable_db_options_, tboptions, - file_options_for_compaction_, read_option, cfd->table_cache(), - iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, BlobFileCreationReason::kRecovery, - empty_seqno_to_time_mapping, &event_logger_, job_id, Env::IO_HIGH, + nullptr /* seqno_to_time_mapping */, &event_logger_, job_id, nullptr /* table_properties */, write_hint, nullptr /*full_history_ts_low*/, &blob_callback_, version, &num_input_entries); @@ -1812,11 +1834,9 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); if (db_options.persist_stats_to_disk) { - column_families.push_back( - ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + column_families.emplace_back(kPersistentStatsColumnFamilyName, cf_options); } std::vector handles; Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); @@ -1915,7 +1935,8 @@ Status DB::OpenAndTrimHistory( return s; } -IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, +IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, + uint64_t log_file_num, uint64_t recycle_log_number, size_t preallocate_block_size, log::Writer** new_log) { IOStatus io_s; @@ -1949,14 +1970,15 @@ IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( std::move(lfile), log_fname, opt_file_options, - immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, - nullptr, tmp_set.Contains(FileType::kWalFile), + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, listeners, nullptr, + tmp_set.Contains(FileType::kWalFile), tmp_set.Contains(FileType::kWalFile))); *new_log = new log::Writer(std::move(file_writer), log_file_num, immutable_db_options_.recycle_log_file_num > 0, immutable_db_options_.manual_wal_flush, immutable_db_options_.wal_compression); - io_s = (*new_log)->AddCompressionTypeRecord(); + io_s = (*new_log)->AddCompressionTypeRecord(write_options); } return io_s; } @@ -1965,6 +1987,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn) { + const WriteOptions write_options(Env::IOActivity::kDBOpen); + const ReadOptions read_options(Env::IOActivity::kDBOpen); + Status s = ValidateOptionsByTable(db_options, column_families); if (!s.ok()) { return s; @@ -1980,7 +2005,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, handles->clear(); size_t max_write_buffer_size = 0; - for (auto cf : column_families) { + for (const auto& cf : column_families) { max_write_buffer_size = std::max(max_write_buffer_size, cf.options.write_buffer_size); } @@ -2041,7 +2066,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, log::Writer* new_log = nullptr; const size_t preallocate_block_size = impl->GetWalPreallocateBlockSize(max_write_buffer_size); - s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/, + s = impl->CreateWAL(write_options, new_log_number, 0 /*recycle_log_number*/, preallocate_block_size, &new_log); if (s.ok()) { InstrumentedMutexLock wl(&impl->log_write_mutex_); @@ -2052,8 +2077,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { - impl->alive_log_files_.push_back( - DBImpl::LogFileNumberSize(impl->logfile_number_)); + impl->alive_log_files_.emplace_back(impl->logfile_number_); // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq @@ -2066,21 +2090,25 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (recovered_seq != kMaxSequenceNumber) { WriteBatch empty_batch; WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); - WriteOptions write_options; uint64_t log_used, log_size; log::Writer* log_writer = impl->logs_.back().writer; LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back(); assert(log_writer->get_log_number() == log_file_number_size.number); impl->mutex_.AssertHeld(); - s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size, - Env::IO_TOTAL, log_file_number_size); + s = impl->WriteToWAL(empty_batch, write_options, log_writer, &log_used, + &log_size, log_file_number_size); if (s.ok()) { // Need to fsync, otherwise it might get lost after a power reset. - s = impl->FlushWAL(false); + s = impl->FlushWAL(write_options, false); TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s); + IOOptions opts; if (s.ok()) { - s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + } + if (s.ok()) { + s = log_writer->file()->Sync(opts, + impl->immutable_db_options_.use_fsync); } } } @@ -2097,7 +2125,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { // set column family handles - for (auto cf : column_families) { + for (const auto& cf : column_families) { auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (cfd != nullptr) { @@ -2111,7 +2139,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl->mutex_.Unlock(); // NOTE: the work normally done in WrapUpCreateColumnFamilies will // be done separately below. - s = impl->CreateColumnFamilyImpl(cf.options, cf.name, &handle); + s = impl->CreateColumnFamilyImpl(read_options, write_options, + cf.options, cf.name, &handle); impl->mutex_.Lock(); if (s.ok()) { handles->push_back(handle); @@ -2163,7 +2192,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // Persist RocksDB Options before scheduling the compaction. // The WriteOptionsFile() will release and lock the mutex internally. persist_options_status = - impl->WriteOptionsFile(true /*db_mutex_already_held*/); + impl->WriteOptionsFile(write_options, true /*db_mutex_already_held*/); *dbptr = impl; impl->opened_successfully_ = true; impl->DeleteObsoleteFiles(); @@ -2263,12 +2292,17 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl); LogFlush(impl->immutable_db_options_.info_log); if (!impl->WALBufferIsEmpty()) { - s = impl->FlushWAL(false); + s = impl->FlushWAL(write_options, false); if (s.ok()) { // Sync is needed otherwise WAL buffered data might get lost after a // power reset. log::Writer* log_writer = impl->logs_.back().writer; - s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = log_writer->file()->Sync(opts, + impl->immutable_db_options_.use_fsync); + } } } if (s.ok() && !persist_options_status.ok()) { @@ -2285,7 +2319,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->StartPeriodicTaskScheduler(); } if (s.ok()) { - s = impl->RegisterRecordSeqnoTimeWorker(recovery_ctx.is_new_db_); + s = impl->RegisterRecordSeqnoTimeWorker(read_options, write_options, + recovery_ctx.is_new_db_); } impl->options_mutex_.Unlock(); if (!s.ok()) { diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 997a4e2edf1..e0d8d3b31ac 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -26,7 +26,7 @@ DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, LogFlush(immutable_db_options_.info_log); } -DBImplReadOnly::~DBImplReadOnly() {} +DBImplReadOnly::~DBImplReadOnly() = default; // Implementations of the DB interface Status DBImplReadOnly::GetImpl(const ReadOptions& read_options, @@ -169,8 +169,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& _read_options, SequenceNumber latest_snapshot = versions_->LastSequence(); SequenceNumber read_seq = read_options.snapshot != nullptr - ? reinterpret_cast(read_options.snapshot) - ->number_ + ? static_cast(read_options.snapshot)->number_ : latest_snapshot; ReadCallback* read_callback = nullptr; // No read callback provided. auto db_iter = NewArenaWrappedDbIterator( @@ -216,8 +215,7 @@ Status DBImplReadOnly::NewIterators( SequenceNumber latest_snapshot = versions_->LastSequence(); SequenceNumber read_seq = read_options.snapshot != nullptr - ? reinterpret_cast(read_options.snapshot) - ->number_ + ? static_cast(read_options.snapshot)->number_ : latest_snapshot; autovector> cfd_to_sv; @@ -295,8 +293,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; s = DBImplReadOnly::OpenForReadOnlyWithoutCheck( @@ -341,7 +338,7 @@ Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( error_if_wal_file_exists); if (s.ok()) { // set column family handles - for (auto cf : column_families) { + for (const auto& cf : column_families) { auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (cfd == nullptr) { diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 32bc8560706..7e39573a717 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -31,18 +31,17 @@ class DBImplReadOnly : public DBImpl { // TODO: Implement ReadOnly MultiGet? using DBImpl::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family) override; + Iterator* NewIterator(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family) override; - virtual Status NewIterators( - const ReadOptions& options, - const std::vector& column_families, - std::vector* iterators) override; + Status NewIterators(const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; using DBImpl::Put; - virtual Status Put(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -59,37 +58,36 @@ class DBImplReadOnly : public DBImpl { } using DBImpl::Merge; - virtual Status Merge(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { + Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::SingleDelete; - virtual Status SingleDelete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { + Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice* /*begin*/, - const Slice* /*end*/) override { + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactFiles; - virtual Status CompactFiles( + Status CompactFiles( const CompactionOptions& /*compact_options*/, ColumnFamilyHandle* /*column_family*/, const std::vector& /*input_file_names*/, @@ -99,33 +97,33 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status DisableFileDeletions() override { + Status DisableFileDeletions() override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status EnableFileDeletions(bool /*force*/) override { + Status EnableFileDeletions() override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status GetLiveFiles(std::vector& ret, - uint64_t* manifest_file_size, - bool /*flush_memtable*/) override { + Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { return DBImpl::GetLiveFiles(ret, manifest_file_size, false /* flush_memtable */); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/) override { + Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::SyncWAL; - virtual Status SyncWAL() override { + Status SyncWAL() override { return Status::NotSupported("Not supported operation in read only mode."); } using DB::IngestExternalFile; - virtual Status IngestExternalFile( + Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { @@ -133,7 +131,7 @@ class DBImplReadOnly : public DBImpl { } using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& /*options*/, const std::string& /*column_family_name*/, const ImportColumnFamilyOptions& /*import_options*/, @@ -142,7 +140,7 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& /*options*/, const std::string& /*column_family_name*/, const ImportColumnFamilyOptions& /*import_options*/, @@ -152,9 +150,9 @@ class DBImplReadOnly : public DBImpl { } using DB::ClipColumnFamily; - virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, - const Slice& /*begin*/, - const Slice& /*end*/) override { + Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { return Status::NotSupported("Not supported operation in read only mode."); } diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 235a528ba08..f41884626e6 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -28,7 +28,7 @@ DBImplSecondary::DBImplSecondary(const DBOptions& db_options, LogFlush(immutable_db_options_.info_log); } -DBImplSecondary::~DBImplSecondary() {} +DBImplSecondary::~DBImplSecondary() = default; Status DBImplSecondary::Recover( const std::vector& column_families, @@ -327,6 +327,7 @@ Status DBImplSecondary::RecoverLogFiles( status = *wal_read_status; } if (!status.ok()) { + wal_read_status->PermitUncheckedError(); return status; } } @@ -497,8 +498,8 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& _read_options, Iterator* result = nullptr; auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); auto cfd = cfh->cfd(); - ReadCallback* read_callback = nullptr; // No read callback provided. if (read_options.tailing) { return NewErrorIterator(Status::NotSupported( "tailing iterator not supported in secondary mode")); @@ -517,27 +518,28 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& _read_options, return NewErrorIterator(s); } } - result = NewIteratorImpl(read_options, cfd, sv, snapshot, read_callback); + result = NewIteratorImpl(read_options, cfh, sv, snapshot, + nullptr /*read_callback*/); } return result; } ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( - const ReadOptions& read_options, ColumnFamilyData* cfd, + const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh, SuperVersion* super_version, SequenceNumber snapshot, ReadCallback* read_callback, bool expose_blob_index, bool allow_refresh) { - assert(nullptr != cfd); + assert(nullptr != cfh); assert(snapshot == kMaxSequenceNumber); snapshot = versions_->LastSequence(); assert(snapshot != kMaxSequenceNumber); auto db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - super_version->current, snapshot, + env_, read_options, *cfh->cfd()->ioptions(), + super_version->mutable_cf_options, super_version->current, snapshot, super_version->mutable_cf_options.max_sequential_skip_in_iterations, - super_version->version_number, read_callback, this, cfd, - expose_blob_index, allow_refresh); + super_version->version_number, read_callback, cfh, expose_blob_index, + allow_refresh); auto internal_iter = NewInternalIterator( - db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + db_iter->GetReadOptions(), cfh->cfd(), super_version, db_iter->GetArena(), snapshot, /* allow_unprepared_value */ true, db_iter); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; @@ -596,28 +598,29 @@ Status DBImplSecondary::NewIterators( return Status::NotSupported("snapshot not supported in secondary mode"); } else { SequenceNumber read_seq(kMaxSequenceNumber); - autovector> cfd_to_sv; + autovector> cfh_to_sv; const bool check_read_ts = read_options.timestamp && read_options.timestamp->size() > 0; - for (auto cfh : column_families) { - ColumnFamilyData* cfd = static_cast(cfh)->cfd(); + for (auto cf : column_families) { + auto cfh = static_cast_with_check(cf); + auto cfd = cfh->cfd(); SuperVersion* sv = cfd->GetReferencedSuperVersion(this); - cfd_to_sv.emplace_back(cfd, sv); + cfh_to_sv.emplace_back(cfh, sv); if (check_read_ts) { const Status s = FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); if (!s.ok()) { - for (auto prev_entry : cfd_to_sv) { + for (auto prev_entry : cfh_to_sv) { CleanupSuperVersion(std::get<1>(prev_entry)); } return s; } } } - assert(cfd_to_sv.size() == column_families.size()); - for (auto [cfd, sv] : cfd_to_sv) { + assert(cfh_to_sv.size() == column_families.size()); + for (auto [cfh, sv] : cfh_to_sv) { iterators->push_back( - NewIteratorImpl(read_options, cfd, sv, read_seq, read_callback)); + NewIteratorImpl(read_options, cfh, sv, read_seq, read_callback)); } } return Status::OK(); @@ -802,7 +805,7 @@ Status DB::OpenAsSecondary( impl->mutex_.Lock(); s = impl->Recover(column_families, true, false, false); if (s.ok()) { - for (auto cf : column_families) { + for (const auto& cf : column_families) { auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (nullptr == cfd) { diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 12a8bbdd707..f1a40af3792 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -112,8 +112,8 @@ class DBImplSecondary : public DBImpl { ColumnFamilyHandle* column_family) override; ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, - ColumnFamilyData* cfd, SuperVersion* sv, - SequenceNumber snapshot, + ColumnFamilyHandleImpl* cfh, + SuperVersion* sv, SequenceNumber snapshot, ReadCallback* read_callback, bool expose_blob_index = false, bool allow_refresh = true); @@ -189,7 +189,7 @@ class DBImplSecondary : public DBImpl { return Status::NotSupported("Not supported operation in secondary mode."); } - Status EnableFileDeletions(bool /*force*/) override { + Status EnableFileDeletions() override { return Status::NotSupported("Not supported operation in secondary mode."); } diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 2f1e93f4d6c..a5674783bf0 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -224,6 +224,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, write_options.protection_bytes_per_key != 8) { return Status::InvalidArgument( "`WriteOptions::protection_bytes_per_key` must be zero or eight"); + } else if (write_options.disableWAL && + immutable_db_options_.recycle_log_file_num > 0) { + return Status::InvalidArgument( + "WriteOptions::disableWAL option is not supported if " + "DBOptions::recycle_log_file_num > 0"); } if (immutable_db_options_.replication_log_listener) { if (immutable_db_options_.unordered_write) { @@ -349,7 +354,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (w.ShouldWriteToMemtable()) { PERF_TIMER_STOP(write_pre_and_post_process_time); - PERF_TIMER_GUARD(write_memtable_time); + PERF_TIMER_FOR_WAIT_GUARD(write_memtable_time); ColumnFamilyMemTablesImpl column_family_memtables( versions_->GetColumnFamilySet()); @@ -461,10 +466,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, valid_batches += writer->batch_cnt; if (writer->ShouldWriteToMemtable()) { total_count += WriteBatchInternal::Count(writer->batch); + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); parallel = parallel && !writer->batch->HasMerge(); } - total_byte_size = WriteBatchInternal::AppendedByteSize( - total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); if (writer->pre_release_callback) { pre_release_callback_cnt++; } @@ -603,7 +608,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (status.ok()) { - PERF_TIMER_GUARD(write_memtable_time); + PERF_TIMER_FOR_WAIT_GUARD(write_memtable_time); if (!parallel) { // w.sequence will be set inside InsertInto @@ -666,9 +671,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, log_write_mutex_.Unlock(); if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; - status = ApplyWALToManifest(read_options, &synced_wals); + status = ApplyWALToManifest(read_options, write_options, &synced_wals); } // Requesting sync with two_write_queues_ is expected to be very rare. We @@ -766,11 +771,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (writer->ShouldWriteToMemtable()) { writer->sequence = next_sequence; size_t count = WriteBatchInternal::Count(writer->batch); + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); next_sequence += count; total_count += count; } - total_byte_size = WriteBatchInternal::AppendedByteSize( - total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); } } if (w.disable_wal) { @@ -829,9 +834,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (w.status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; - w.status = ApplyWALToManifest(read_options, &synced_wals); + w.status = ApplyWALToManifest(read_options, write_options, &synced_wals); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } @@ -843,7 +848,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteThread::WriteGroup memtable_write_group; if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { - PERF_TIMER_GUARD(write_memtable_time); + PERF_TIMER_FOR_WAIT_GUARD(write_memtable_time); assert(w.ShouldWriteToMemtable()); write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group); if (memtable_write_group.size > 1 && @@ -865,6 +870,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_FOR_WAIT_GUARD(write_memtable_time); + assert(w.ShouldWriteToMemtable()); ColumnFamilyMemTablesImpl column_family_memtables( versions_->GetColumnFamilySet()); @@ -874,6 +882,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/, write_options.memtable_insert_hint_per_batch); + + PERF_TIMER_STOP(write_memtable_time); + PERF_TIMER_START(write_pre_and_post_process_time); + if (write_thread_.CompleteParallelMemTableWriter(&w)) { MemTableInsertStatusCheck(w.status); versions_->SetLastSequence(w.write_group->last_sequence); @@ -906,6 +918,9 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_FOR_WAIT_GUARD(write_memtable_time); + ColumnFamilyMemTablesImpl column_family_memtables( versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( @@ -917,6 +932,8 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, if (write_options.disableWAL) { has_unpersisted_data_.store(true, std::memory_order_relaxed); } + + PERF_TIMER_START(write_pre_and_post_process_time); } size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; @@ -987,9 +1004,13 @@ Status DBImpl::WriteImplWALOnly( return status; } } else { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_FOR_WAIT_GUARD(write_delay_time); InstrumentedMutexLock lock(&mutex_); Status status = DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options); + PERF_TIMER_STOP(write_delay_time); + PERF_TIMER_START(write_pre_and_post_process_time); if (!status.ok()) { WriteThread::WriteGroup write_group; write_thread->EnterAsBatchGroupLeader(&w, &write_group); @@ -1258,7 +1279,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || write_controller_.NeedsDelay()))) { PERF_TIMER_STOP(write_pre_and_post_process_time); - PERF_TIMER_GUARD(write_delay_time); + PERF_TIMER_FOR_WAIT_GUARD(write_delay_time); // We don't know size of curent batch so that we always use the size // for previous one. It might create a fairness issue that expiration // might happen for smaller writes but larger writes can go through. @@ -1364,9 +1385,9 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, // When two_write_queues_ is disabled, this function is called from the only // write thread. Otherwise this must be called holding log_write_mutex_. IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, + const WriteOptions& write_options, log::Writer* log_writer, uint64_t* log_used, uint64_t* log_size, - Env::IOPriority rate_limiter_priority, LogFileNumberSize& log_file_number_size) { assert(log_size != nullptr); @@ -1389,12 +1410,11 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, log_write_mutex_.Lock(); } IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord( - versions_->GetColumnFamiliesTimestampSizeForRecord(), - rate_limiter_priority); + write_options, versions_->GetColumnFamiliesTimestampSizeForRecord()); if (!io_s.ok()) { return io_s; } - io_s = log_writer->AddRecord(log_entry, rate_limiter_priority); + io_s = log_writer->AddRecord(write_options, log_entry); if (UNLIKELY(needs_locking)) { log_write_mutex_.Unlock(); @@ -1437,9 +1457,13 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, WriteBatchInternal::SetSequence(merged_batch, sequence); uint64_t log_size; - io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, - write_group.leader->rate_limiter_priority, - log_file_number_size); + + // TODO: plumb Env::IOActivity, Env::IOPriority + WriteOptions write_options; + write_options.rate_limiter_priority = + write_group.leader->rate_limiter_priority; + io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used, + &log_size, log_file_number_size); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; @@ -1466,10 +1490,17 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, log_write_mutex_.Lock(); } - for (auto& log : logs_) { - io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync); - if (!io_s.ok()) { - break; + if (io_s.ok()) { + for (auto& log : logs_) { + IOOptions opts; + io_s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!io_s.ok()) { + break; + } + io_s = log.writer->file()->Sync(opts, immutable_db_options_.use_fsync); + if (!io_s.ok()) { + break; + } } } @@ -1542,9 +1573,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL( assert(log_writer->get_log_number() == log_file_number_size.number); uint64_t log_size; - io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, - write_group.leader->rate_limiter_priority, - log_file_number_size); + + // TODO: plumb Env::IOActivity, Env::IOPriority + WriteOptions write_options; + write_options.rate_limiter_priority = + write_group.leader->rate_limiter_priority; + io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used, + &log_size, log_file_number_size); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; @@ -1609,6 +1644,8 @@ Status DBImpl::WriteRecoverableState() { if (status.ok()) { cached_recoverable_state_.Clear(); cached_recoverable_state_empty_ = true; + } else { + // FIXME: !ok status is untested } return status; } @@ -1907,11 +1944,12 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, delay = 0; } TEST_SYNC_POINT("DBImpl::DelayWrite:Start"); + start_time = immutable_db_options_.clock->NowMicros(); + if (delay > 0) { if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); } - start_time = immutable_db_options_.clock->NowMicros(); TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); // Notify write_thread about the stall so it can setup a barrier and @@ -2035,7 +2073,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, // is that in case the write is heavy, low pri writes may never have // a chance to run. Now we guarantee we are still slowly making // progress. - PERF_TIMER_GUARD(write_delay_time); + PERF_TIMER_FOR_WAIT_GUARD(write_delay_time); auto data_size = my_batch->GetDataSize(); while (data_size > 0) { size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken( @@ -2145,8 +2183,10 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { mem_switch_record); } + TEST_SYNC_POINT_CALLBACK("DBImpl::ScheduleFlushes:PreSwitchMemtable", + nullptr); for (auto& cfd : cfds) { - if (!cfd->mem()->IsEmpty()) { + if (status.ok() && !cfd->mem()->IsEmpty()) { if (immutable_db_options_.replication_log_listener) { status = SwitchMemtableWithoutCreatingWAL( cfd, context, mem_switch_record.next_log_num, replication_sequence); @@ -2157,9 +2197,6 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { if (cfd->UnrefAndTryDelete()) { cfd = nullptr; } - if (!status.ok()) { - break; - } } if (two_write_queues_) { @@ -2194,7 +2231,7 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, } mutex_.Unlock(); - for (auto listener : immutable_db_options_.listeners) { + for (const auto& listener : immutable_db_options_.listeners) { listener->OnMemTableSealed(mem_table_info); } mutex_.Lock(); @@ -2273,8 +2310,10 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // replication_log_listener is set assert(!immutable_db_options_.replication_log_listener); mutex_.AssertHeld(); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; IOStatus io_s; @@ -2297,8 +2336,10 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { log_write_mutex_.Unlock(); } uint64_t recycle_log_number = 0; + // If file deletion is disabled, don't recycle logs since it'll result in + // the file getting renamed if (creating_new_log && immutable_db_options_.recycle_log_file_num && - !log_recycle_files_.empty()) { + !log_recycle_files_.empty() && IsFileDeletionsEnabled()) { recycle_log_number = log_recycle_files_.front(); } uint64_t new_log_number = @@ -2321,8 +2362,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { if (creating_new_log) { // TODO: Write buffer size passed in should be max of all CF's instead // of mutable_cf_options.write_buffer_size. - io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, - &new_log); + io_s = CreateWAL(write_options, new_log_number, recycle_log_number, + preallocate_block_size, &new_log); if (s.ok()) { s = io_s; } @@ -2359,7 +2400,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // In recovery path, we force another try of writing WAL buffer. cur_log_writer->file()->reset_seen_error(); } - io_s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(write_options); if (s.ok()) { s = io_s; } @@ -2376,7 +2417,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { log_empty_ = true; log_dir_synced_ = false; logs_.emplace_back(logfile_number_, new_log); - alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); + alive_log_files_.emplace_back(logfile_number_); } } @@ -2427,7 +2468,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { VersionEdit wal_deletion; wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); s = versions_->LogAndApplyToDefaultColumnFamily( - read_options, &wal_deletion, &mutex_, directories_.GetDbDir()); + read_options, write_options, &wal_deletion, &mutex_, + directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 7dd64795513..e454897125f 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -5,10 +5,9 @@ #include "db/db_info_dumper.h" -#include - #include #include +#include #include #include @@ -99,7 +98,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, for (auto& db_path : options.db_paths) { if (dbname.compare(db_path.path) != 0) { s = env->GetChildren(db_path.path, &files); - if (!s.ok()) { + if (s.IsNotFound() || s.IsPathNotFound()) { + Header(options.info_log, + "Directory from db_paths/cf_paths does not yet exist: %s\n", + db_path.path.c_str()); + continue; + } else if (!s.ok()) { Error(options.info_log, "Error when reading %s dir %s\n", db_path.path.c_str(), s.ToString().c_str()); continue; @@ -122,12 +126,18 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, // Get wal file in wal_dir const auto& wal_dir = options.GetWalDir(dbname); + bool log_wal_info = true; if (!options.IsWalDirSameAsDBPath(dbname)) { s = env->GetChildren(wal_dir, &files); - if (!s.ok()) { - Error(options.info_log, "Error when reading %s dir %s\n", wal_dir.c_str(), - s.ToString().c_str()); - return; + if (s.IsNotFound() || s.IsPathNotFound()) { + Header(options.info_log, + "Write Ahead Log directory does not yet exist: %s\n", + wal_dir.c_str()); + log_wal_info = false; + } else if (!s.ok()) { + Error(options.info_log, "Error when reading wal dir %s: %s\n", + wal_dir.c_str(), s.ToString().c_str()); + log_wal_info = false; } wal_info.clear(); for (const std::string& file : files) { @@ -147,7 +157,9 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, } } } - Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(), - wal_info.c_str()); + if (log_wal_info) { + Header(options.info_log, "Write Ahead Log file in %s: %s\n", + wal_dir.c_str(), wal_info.c_str()); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index e79272ea7ec..32e41e83ffe 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -13,6 +13,120 @@ #include "util/random.h" namespace ROCKSDB_NAMESPACE { +namespace { +// A wrapper that allows injection of errors. +class CorruptionFS : public FileSystemWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + explicit CorruptionFS(const std::shared_ptr& _target, + bool fs_buffer) + : FileSystemWrapper(_target), + writable_file_error_(false), + num_writable_file_errors_(0), + corruption_trigger_(INT_MAX), + read_count_(0), + rnd_(300), + fs_buffer_(fs_buffer) {} + ~CorruptionFS() override { + // Assert that the corruption was reset, which means it got triggered + assert(corruption_trigger_ == INT_MAX); + } + const char* Name() const override { return "ErrorEnv"; } + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return IOStatus::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, opts, result, dbg); + } + + void SetCorruptionTrigger(const int trigger) { + corruption_trigger_ = trigger; + read_count_ = 0; + } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + class CorruptionRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + CorruptionRandomAccessFile(CorruptionFS& fs, + std::unique_ptr& file) + : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} + + IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + IOStatus s = target()->Read(offset, len, opts, result, scratch, dbg); + if (opts.verify_and_reconstruct_read) { + return s; + } + if (s.ok() && ++fs_.read_count_ >= fs_.corruption_trigger_) { + fs_.read_count_ = 0; + fs_.corruption_trigger_ = INT_MAX; + char* data = const_cast(result->data()); + std::memcpy( + data, + fs_.rnd_.RandomString(static_cast(result->size())).c_str(), + result->size()); + } + return s; + } + + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) override { + for (size_t i = 0; i < num_reqs; ++i) { + FSReadRequest& req = reqs[i]; + if (fs_.fs_buffer_) { + FSAllocationPtr buffer(new char[req.len], [](void* ptr) { + delete[] static_cast(ptr); + }); + req.fs_scratch = std::move(buffer); + req.status = Read(req.offset, req.len, options, &req.result, + static_cast(req.fs_scratch.get()), dbg); + } else { + req.status = Read(req.offset, req.len, options, &req.result, + req.scratch, dbg); + } + } + return IOStatus::OK(); + } + + private: + CorruptionFS& fs_; + }; + + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + EXPECT_OK(s); + result->reset(new CorruptionRandomAccessFile(*this, file)); + + return s; + } + + void SupportedOps(int64_t& supported_ops) override { + supported_ops = 1 << FSSupportedOps::kVerifyAndReconstructRead | + 1 << FSSupportedOps::kAsyncIO; + if (fs_buffer_) { + supported_ops |= 1 << FSSupportedOps::kFSBuffer; + } + } + + private: + int corruption_trigger_; + int read_count_; + Random rnd_; + bool fs_buffer_; +}; +} // anonymous namespace class DBIOFailureTest : public DBTestBase { public: @@ -579,6 +693,132 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { ASSERT_EQ("bar3", Get(1, "foo")); } #endif // !(defined NDEBUG) || !defined(OS_WIN) + +class DBIOCorruptionTest + : public DBIOFailureTest, + public testing::WithParamInterface> { + public: + DBIOCorruptionTest() : DBIOFailureTest() { + BlockBasedTableOptions bbto; + Options options = CurrentOptions(); + + base_env_ = env_; + EXPECT_NE(base_env_, nullptr); + fs_.reset( + new CorruptionFS(base_env_->GetFileSystem(), std::get<0>(GetParam()))); + env_guard_ = NewCompositeEnv(fs_); + options.env = env_guard_.get(); + bbto.num_file_reads_for_auto_readahead = 0; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.disable_auto_compactions = true; + + Reopen(options); + } + + ~DBIOCorruptionTest() { + Close(); + db_ = nullptr; + } + + protected: + std::unique_ptr env_guard_; + std::shared_ptr fs_; + Env* base_env_; +}; + +TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) { + CorruptionFS* fs = + static_cast(env_guard_->GetFileSystem().get()); + + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Flush()); + fs->SetCorruptionTrigger(1); + + std::string val; + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + ASSERT_OK(dbfull()->Get(ReadOptions(), "key1", &val)); + ASSERT_EQ(val, "val1"); +} + +TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) { + CorruptionFS* fs = + static_cast(env_guard_->GetFileSystem().get()); + + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Flush()); + fs->SetCorruptionTrigger(1); + + ReadOptions ro; + ro.readahead_size = 65536; + ro.async_io = std::get<1>(GetParam()); + + Iterator* iter = dbfull()->NewIterator(ro); + iter->SeekToFirst(); + while (iter->status().ok() && iter->Valid()) { + iter->Next(); + } + ASSERT_OK(iter->status()); + delete iter; +} + +TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) { + CorruptionFS* fs = + static_cast(env_guard_->GetFileSystem().get()); + + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + fs->SetCorruptionTrigger(1); + + std::vector keystr{"key1", "key2"}; + std::vector keys{Slice(keystr[0]), Slice(keystr[1])}; + std::vector values(keys.size()); + std::vector statuses(keys.size()); + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + ASSERT_EQ(values[0].ToString(), "val1"); + ASSERT_EQ(values[1].ToString(), "val2"); +} + +TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) { + CorruptionFS* fs = + static_cast(env_guard_->GetFileSystem().get()); + + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + fs->SetCorruptionTrigger(1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string val; + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + ASSERT_OK(dbfull()->Get(ro, "key1", &val)); + ASSERT_EQ(val, "val1"); +} + +TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) { + CorruptionFS* fs = + static_cast(env_guard_->GetFileSystem().get()); + + ASSERT_OK(Put("key1", "val1")); + fs->SetCorruptionTrigger(1); + ASSERT_OK(Flush()); + + std::string val; + ReadOptions ro; + ro.async_io = std::get<1>(GetParam()); + ASSERT_OK(dbfull()->Get(ro, "key1", &val)); + ASSERT_EQ(val, "val1"); +} + +INSTANTIATE_TEST_CASE_P(DBIOCorruptionTest, DBIOCorruptionTest, + testing::Combine(testing::Bool(), testing::Bool())); } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_iter.cc b/db/db_iter.cc index 418c538d437..65e2fc1654b 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -43,8 +43,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, const Comparator* cmp, InternalIterator* iter, const Version* version, SequenceNumber s, bool arena_mode, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool expose_blob_index) + ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, + bool expose_blob_index) : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), env_(_env), clock_(ioptions.clock), @@ -79,12 +79,10 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, is_blob_(false), arena_mode_(arena_mode), io_activity_(read_options.io_activity), - db_impl_(db_impl), - cfd_(cfd), + cfh_(cfh), timestamp_ub_(read_options.timestamp), timestamp_lb_(read_options.iter_start_ts), - timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0), - auto_readahead_size_(read_options.auto_readahead_size) { + timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) { RecordTick(statistics_, NO_ITERATOR_CREATED); if (pin_thru_lifetime_) { pinned_iters_mgr_.StartPinning(); @@ -115,8 +113,8 @@ Status DBIter::GetProperty(std::string prop_name, std::string* prop) { *prop = saved_key_.GetUserKey().ToString(); return Status::OK(); } else if (prop_name == "rocksdb.iterator.write-time") { - // TODO(yuzhangyu): implement return the actual write time. - return Status::NotSupported("write time property is under construction"); + PutFixed64(prop, saved_write_unix_time_); + return Status::OK(); } return Status::InvalidArgument("Unidentified property."); } @@ -202,6 +200,7 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to // avoid having to copy options back and forth. + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; read_options.read_tier = read_tier_; read_options.fill_cache = fill_cache_; @@ -393,11 +392,10 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, } break; case kTypeValue: + case kTypeValuePreferredSeqno: case kTypeBlobIndex: case kTypeWideColumnEntity: - if (!iter_.PrepareValue()) { - assert(!iter_.status().ok()); - valid_ = false; + if (!PrepareValue()) { return false; } if (timestamp_lb_) { @@ -420,17 +418,21 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, return false; } } else { - assert(ikey_.type == kTypeValue); - SetValueAndColumnsFromPlain(iter_.value()); + assert(ikey_.type == kTypeValue || + ikey_.type == kTypeValuePreferredSeqno); + Slice value = iter_.value(); + saved_write_unix_time_ = iter_.write_unix_time(); + if (ikey_.type == kTypeValuePreferredSeqno) { + value = ParsePackedValueForValue(value); + } + SetValueAndColumnsFromPlain(value); } valid_ = true; return true; break; case kTypeMerge: - if (!iter_.PrepareValue()) { - assert(!iter_.status().ok()); - valid_ = false; + if (!PrepareValue()) { return false; } saved_key_.SetUserKey( @@ -575,15 +577,19 @@ bool DBIter::MergeValuesNewToOld() { iter_.Next(); break; } - if (!iter_.PrepareValue()) { - valid_ = false; + if (!PrepareValue()) { return false; } - if (kTypeValue == ikey.type) { - // hit a put, merge the put value with operands and store the - // final result in saved_value_. We are done! - if (!MergeWithPlainBaseValue(iter_.value(), ikey.user_key)) { + if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) { + Slice value = iter_.value(); + saved_write_unix_time_ = iter_.write_unix_time(); + if (kTypeValuePreferredSeqno == ikey.type) { + value = ParsePackedValueForValue(value); + } + // hit a put or put equivalent, merge the put value with operands and + // store the final result in saved_value_. We are done! + if (!MergeWithPlainBaseValue(value, ikey.user_key)) { return false; } // iter_ is positioned after put @@ -706,16 +712,21 @@ bool DBIter::ReverseToForward() { // not exist or may have different prefix than the current key(). // If that's the case, seek iter_ to current key. if (!expect_total_order_inner_iter() || !iter_.Valid()) { - IterKey last_key; - ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, - kValueTypeForSeek); - if (timestamp_size_ > 0) { + std::string last_key; + if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, ParsedInternalKey(saved_key_.GetUserKey(), + kMaxSequenceNumber, kValueTypeForSeek)); + } else { // TODO: pre-create kTsMax. const std::string kTsMax(timestamp_size_, '\xff'); - pikey.SetTimestamp(kTsMax); + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek), + kTsMax); } - last_key.SetInternalKey(pikey); - iter_.Seek(last_key.GetInternalKey()); + iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } @@ -747,22 +758,15 @@ bool DBIter::ReverseToBackward() { // When current_entry_is_merged_ is true, iter_ may be positioned on the next // key, which may not exist or may have prefix different from current. // If that's the case, seek to saved_key_. - // - // In case of auto_readahead_size enabled, index_iter moves forward during - // forward scan for block cache lookup and points to different block. If Prev - // op is called, it needs to call SeekForPrev to point to right index_iter_ in - // BlockBasedTableIterator. This only happens when direction is changed from - // forward to backward. - if ((current_entry_is_merged_ && - (!expect_total_order_inner_iter() || !iter_.Valid())) || - auto_readahead_size_) { + if (current_entry_is_merged_ && + (!expect_total_order_inner_iter() || !iter_.Valid())) { IterKey last_key; // Using kMaxSequenceNumber and kValueTypeForSeek // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller // than saved_key_. last_key.SetInternalKey(ParsedInternalKey( saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); - if (!expect_total_order_inner_iter() || auto_readahead_size_) { + if (!expect_total_order_inner_iter()) { iter_.SeekForPrev(last_key.GetInternalKey()); } else { // Some iterators may not support SeekForPrev(), so we avoid using it @@ -852,8 +856,8 @@ bool DBIter::FindValueForCurrentKey() { merge_context_.Clear(); current_entry_is_merged_ = false; // last entry before merge (could be kTypeDeletion, - // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue, - // kTypeBlobIndex, or kTypeWideColumnEntity) + // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue + // kTypeBlobIndex, kTypeWideColumnEntity or kTypeValuePreferredSeqno) ValueType last_not_merge_type = kTypeDeletion; ValueType last_key_entry_type = kTypeDeletion; @@ -909,8 +913,7 @@ bool DBIter::FindValueForCurrentKey() { return FindValueForCurrentKeyUsingSeek(); } - if (!iter_.PrepareValue()) { - valid_ = false; + if (!PrepareValue()) { return false; } @@ -931,10 +934,16 @@ bool DBIter::FindValueForCurrentKey() { last_key_entry_type = ikey.type; switch (last_key_entry_type) { case kTypeValue: + case kTypeValuePreferredSeqno: case kTypeBlobIndex: case kTypeWideColumnEntity: if (iter_.iter()->IsValuePinned()) { - pinned_value_ = iter_.value(); + saved_write_unix_time_ = iter_.write_unix_time(); + if (last_key_entry_type == kTypeValuePreferredSeqno) { + pinned_value_ = ParsePackedValueForValue(iter_.value()); + } else { + pinned_value_ = iter_.value(); + } } else { valid_ = false; status_ = Status::NotSupported( @@ -1044,7 +1053,8 @@ bool DBIter::FindValueForCurrentKey() { return true; } else { - assert(last_not_merge_type == kTypeValue); + assert(last_not_merge_type == kTypeValue || + last_not_merge_type == kTypeValuePreferredSeqno); if (!MergeWithPlainBaseValue(pinned_value_, saved_key_.GetUserKey())) { return false; } @@ -1052,6 +1062,7 @@ bool DBIter::FindValueForCurrentKey() { } break; case kTypeValue: + case kTypeValuePreferredSeqno: SetValueAndColumnsFromPlain(pinned_value_); break; @@ -1149,18 +1160,22 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } return true; } - if (!iter_.PrepareValue()) { - valid_ = false; + if (!PrepareValue()) { return false; } if (timestamp_size_ > 0) { Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_); saved_timestamp_.assign(ts.data(), ts.size()); } - if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex || - ikey.type == kTypeWideColumnEntity) { + if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno || + ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) { assert(iter_.iter()->IsValuePinned()); - pinned_value_ = iter_.value(); + saved_write_unix_time_ = iter_.write_unix_time(); + if (ikey.type == kTypeValuePreferredSeqno) { + pinned_value_ = ParsePackedValueForValue(iter_.value()); + } else { + pinned_value_ = iter_.value(); + } if (ikey.type == kTypeBlobIndex) { if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { return false; @@ -1173,7 +1188,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return false; } } else { - assert(ikey.type == kTypeValue); + assert(ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno); SetValueAndColumnsFromPlain(pinned_value_); } @@ -1215,13 +1230,16 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { ikey.type == kTypeDeletionWithTimestamp) { break; } - if (!iter_.PrepareValue()) { - valid_ = false; + if (!PrepareValue()) { return false; } - if (ikey.type == kTypeValue) { - if (!MergeWithPlainBaseValue(iter_.value(), saved_key_.GetUserKey())) { + if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno) { + Slice value = iter_.value(); + if (ikey.type == kTypeValuePreferredSeqno) { + value = ParsePackedValueForValue(value); + } + if (!MergeWithPlainBaseValue(value, saved_key_.GetUserKey())) { return false; } return true; @@ -1293,8 +1311,8 @@ bool DBIter::MergeWithNoBaseValue(const Slice& user_key) { const Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key, MergeHelper::kNoBaseValue, merge_context_.GetOperands(), logger_, statistics_, clock_, - /* update_num_ops_stats */ true, &saved_value_, &pinned_value_, - &result_type, /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + &saved_value_, &pinned_value_, &result_type); return SetValueAndColumnsFromMergeResult(s, result_type); } @@ -1306,8 +1324,8 @@ bool DBIter::MergeWithPlainBaseValue(const Slice& value, const Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key, MergeHelper::kPlainBaseValue, value, merge_context_.GetOperands(), logger_, statistics_, clock_, - /* update_num_ops_stats */ true, &saved_value_, &pinned_value_, - &result_type, /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + &saved_value_, &pinned_value_, &result_type); return SetValueAndColumnsFromMergeResult(s, result_type); } @@ -1319,8 +1337,8 @@ bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity, const Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key, MergeHelper::kWideBaseValue, entity, merge_context_.GetOperands(), logger_, statistics_, clock_, - /* update_num_ops_stats */ true, &saved_value_, &pinned_value_, - &result_type, /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + &saved_value_, &pinned_value_, &result_type); return SetValueAndColumnsFromMergeResult(s, result_type); } @@ -1358,18 +1376,23 @@ bool DBIter::FindUserKeyBeforeSavedKey() { if (num_skipped >= max_skip_) { num_skipped = 0; - IterKey last_key; - ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, - kValueTypeForSeek); - if (timestamp_size_ > 0) { + std::string last_key; + if (timestamp_size_ == 0) { + AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), + kMaxSequenceNumber, + kValueTypeForSeek)); + } else { // TODO: pre-create kTsMax. const std::string kTsMax(timestamp_size_, '\xff'); - pikey.SetTimestamp(kTsMax); + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek), + kTsMax); } - last_key.SetInternalKey(pikey); // It would be more efficient to use SeekForPrev() here, but some // iterators may not support it. - iter_.Seek(last_key.GetInternalKey()); + iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); if (!iter_.Valid()) { break; @@ -1474,7 +1497,7 @@ void DBIter::Seek(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); StopWatch sw(clock_, statistics_, DB_SEEK); - if (db_impl_ != nullptr && cfd_ != nullptr) { + if (cfh_ != nullptr) { // TODO: What do we do if this returns an error? Slice lower_bound, upper_bound; if (iterate_lower_bound_ != nullptr) { @@ -1487,7 +1510,9 @@ void DBIter::Seek(const Slice& target) { } else { upper_bound = Slice(""); } - db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + cfh_->db() + ->TraceIteratorSeek(cfh_->cfd()->GetID(), target, lower_bound, + upper_bound) .PermitUncheckedError(); } @@ -1548,7 +1573,7 @@ void DBIter::SeekForPrev(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); StopWatch sw(clock_, statistics_, DB_SEEK); - if (db_impl_ != nullptr && cfd_ != nullptr) { + if (cfh_ != nullptr) { // TODO: What do we do if this returns an error? Slice lower_bound, upper_bound; if (iterate_lower_bound_ != nullptr) { @@ -1561,8 +1586,8 @@ void DBIter::SeekForPrev(const Slice& target) { } else { upper_bound = Slice(""); } - db_impl_ - ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound, + cfh_->db() + ->TraceIteratorSeekForPrev(cfh_->cfd()->GetID(), target, lower_bound, upper_bound) .PermitUncheckedError(); } @@ -1726,13 +1751,12 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, InternalIterator* internal_iter, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool expose_blob_index) { - DBIter* db_iter = - new DBIter(env, read_options, ioptions, mutable_cf_options, - user_key_comparator, internal_iter, version, sequence, false, - max_sequential_skip_in_iterations, read_callback, db_impl, cfd, - expose_blob_index); + ReadCallback* read_callback, + ColumnFamilyHandleImpl* cfh, bool expose_blob_index) { + DBIter* db_iter = new DBIter( + env, read_options, ioptions, mutable_cf_options, user_key_comparator, + internal_iter, version, sequence, false, + max_sequential_skip_in_iterations, read_callback, cfh, expose_blob_index); return db_iter; } diff --git a/db/db_iter.h b/db/db_iter.h index d18bf019c4b..44276719a1b 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -118,7 +118,7 @@ class DBIter final : public Iterator { const MutableCFOptions& mutable_cf_options, const Comparator* cmp, InternalIterator* iter, const Version* version, SequenceNumber s, bool arena_mode, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, bool expose_blob_index); // No copying allowed @@ -126,6 +126,10 @@ class DBIter final : public Iterator { void operator=(const DBIter&) = delete; ~DBIter() override { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_UNKNOWN); // Release pinned data if any if (pinned_iters_mgr_.PinningEnabled()) { pinned_iters_mgr_.ReleasePinnedData(); @@ -134,6 +138,7 @@ class DBIter final : public Iterator { ResetInternalKeysSkippedCounter(); local_stats_.BumpGlobalStatistics(statistics_); iter_.DeleteIter(arena_mode_); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } void SetIter(InternalIterator* iter) { assert(iter_.iter() == nullptr); @@ -330,6 +335,22 @@ class DBIter final : public Iterator { bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key); bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key); + bool PrepareValue() { + if (!iter_.PrepareValue()) { + assert(!iter_.status().ok()); + valid_ = false; + return false; + } + // ikey_ could change as BlockBasedTableIterator does Block cache + // lookup and index_iter_ could point to different block resulting + // in ikey_ pointing to wrong key. So ikey_ needs to be updated in + // case of Seek/Next calls to point to right key again. + if (!ParseKey(&ikey_)) { + return false; + } + return true; + } + const SliceTransform* prefix_extractor_; Env* const env_; SystemClock* clock_; @@ -348,6 +369,12 @@ class DBIter final : public Iterator { // and should not be used across functions. Reusing this object can reduce // overhead of calling construction of the function if creating it each time. ParsedInternalKey ikey_; + + // The approximate write time for the entry. It is deduced from the entry's + // sequence number if the seqno to time mapping is available. For a + // kTypeValuePreferredSeqno entry, this is the write time specified by the + // user. + uint64_t saved_write_unix_time_; std::string saved_value_; Slice pinned_value_; // for prefix seek mode to support prev() @@ -398,25 +425,22 @@ class DBIter final : public Iterator { MergeContext merge_context_; LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; - DBImpl* db_impl_; - ColumnFamilyData* cfd_; + ColumnFamilyHandleImpl* cfh_; const Slice* const timestamp_ub_; const Slice* const timestamp_lb_; const size_t timestamp_size_; std::string saved_timestamp_; - bool auto_readahead_size_; }; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. -extern Iterator* NewDBIterator( +Iterator* NewDBIterator( Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, InternalIterator* internal_iter, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, - DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, - bool expose_blob_index = false); + ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index 872f7e6bd93..daecbcc7acb 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -50,7 +50,9 @@ struct Entry { bool visible = true; bool operator<(const Entry& e) const { - if (key != e.key) return key < e.key; + if (key != e.key) { + return key < e.key; + } return std::tie(sequence, type) > std::tie(e.sequence, e.type); } }; @@ -177,7 +179,9 @@ struct StressTestIterator : public InternalIterator { } void SeekToFirst() override { - if (MaybeFail()) return; + if (MaybeFail()) { + return; + } MaybeMutate(); status_ = Status::OK(); @@ -185,7 +189,9 @@ struct StressTestIterator : public InternalIterator { SkipForward(); } void SeekToLast() override { - if (MaybeFail()) return; + if (MaybeFail()) { + return; + } MaybeMutate(); status_ = Status::OK(); @@ -194,7 +200,9 @@ struct StressTestIterator : public InternalIterator { } void Seek(const Slice& target) override { - if (MaybeFail()) return; + if (MaybeFail()) { + return; + } MaybeMutate(); status_ = Status::OK(); @@ -206,7 +214,9 @@ struct StressTestIterator : public InternalIterator { SkipForward(); } void SeekForPrev(const Slice& target) override { - if (MaybeFail()) return; + if (MaybeFail()) { + return; + } MaybeMutate(); status_ = Status::OK(); @@ -221,14 +231,18 @@ struct StressTestIterator : public InternalIterator { void Next() override { assert(Valid()); - if (MaybeFail()) return; + if (MaybeFail()) { + return; + } MaybeMutate(); ++iter; SkipForward(); } void Prev() override { assert(Valid()); - if (MaybeFail()) return; + if (MaybeFail()) { + return; + } MaybeMutate(); --iter; SkipBackward(); @@ -318,7 +332,9 @@ struct ReferenceIterator { return false; } assert(e.sequence <= sequence); - if (!e.visible) continue; + if (!e.visible) { + continue; + } if (e.type == kTypeDeletion) { return false; } @@ -339,11 +355,13 @@ struct ReferenceIterator { break; } assert(e.sequence <= sequence); - if (!e.visible) continue; + if (!e.visible) { + continue; + } if (e.type == kTypeDeletion) { break; } - operands.push_back(e.value); + operands.emplace_back(e.value); if (e.type == kTypeValue) { break; } @@ -588,15 +606,17 @@ TEST_F(DBIteratorStressTest, StressTest) { // Check that the key moved in the right direction. if (forward) { - if (seek) + if (seek) { ASSERT_GE(db_iter->key().ToString(), old_key); - else + } else { ASSERT_GT(db_iter->key().ToString(), old_key); + } } else { - if (seek) + if (seek) { ASSERT_LE(db_iter->key().ToString(), old_key); - else + } else { ASSERT_LT(db_iter->key().ToString(), old_key); + } } if (ref_iter->Valid()) { diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 6fd4469700b..cf8321808f9 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -45,6 +45,14 @@ class TestIterator : public InternalIterator { Add(argkey, kTypeValue, argvalue); } + void AddTimedPut(std::string argkey, std::string argvalue, + uint64_t write_unix_time) { + std::string packed_value; + [[maybe_unused]] auto packed_value_slice = + PackValueAndWriteTime(argvalue, write_unix_time, &packed_value); + Add(argkey, kTypeValuePreferredSeqno, packed_value); + } + void AddDeletion(std::string argkey) { Add(argkey, kTypeDeletion, std::string()); } @@ -65,8 +73,7 @@ class TestIterator : public InternalIterator { size_t seq_num, bool update_iter = false) { valid_ = true; ParsedInternalKey internal_key(argkey, seq_num, type); - data_.push_back( - std::pair(std::string(), argvalue)); + data_.emplace_back(std::string(), argvalue); AppendInternalKey(&data_.back().first, internal_key); if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) { // insert a key smaller than current key @@ -1389,6 +1396,60 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { } } +TEST_F(DBIteratorTest, DBIteratorTimedPutBasic) { + ReadOptions ro; + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddTimedPut("a", "0", /*write_unix_time=*/0); + internal_iter->AddMerge("a", "1"); + internal_iter->AddTimedPut("b", "0", /*write_unix_time=*/0); + internal_iter->AddDeletion("b"); + internal_iter->AddTimedPut("c", "01", /*write_unix_time=*/0); + internal_iter->AddTimedPut("c", "02", /*write_unix_time=*/0); + internal_iter->AddTimedPut("c", "2", /*write_unix_time=*/0); + internal_iter->AddTimedPut("d", "3", /*write_unix_time=*/0); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 7 /* sequence */, /*max_sequential_skip_in_iterations*/ 1, + nullptr /* read_callback */)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0,1"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "d"); + ASSERT_EQ(db_iter->value().ToString(), "3"); + db_iter->Next(); + ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "d"); + ASSERT_EQ(db_iter->value().ToString(), "3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0,1"); + db_iter->Prev(); + ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); +} + TEST_F(DBIteratorTest, DBIterator1) { ReadOptions ro; Options options; @@ -2617,7 +2678,7 @@ class DBIterWithMergeIterTest : public testing::Test { child_iters.push_back(internal_iter2_); InternalKeyComparator icomp(BytewiseComparator()); InternalIterator* merge_iter = - NewMergingIterator(&icomp_, &child_iters[0], 2u); + NewMergingIterator(&icomp_, child_iters.data(), 2u); db_iter_.reset(NewDBIterator( env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_), @@ -2825,7 +2886,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) { // Seek() and before calling Prev() ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "MergeIterator::Prev:BeforePrev", [&](void* arg) { - IteratorWrapper* it = reinterpret_cast(arg); + IteratorWrapper* it = static_cast(arg); if (it->key().starts_with("z")) { internal_iter2_->Add("x", kTypeValue, "7", 16u, true); internal_iter2_->Add("x", kTypeValue, "7", 15u, true); @@ -2876,7 +2937,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) { // Seek() and before calling Prev() ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "MergeIterator::Prev:BeforePrev", [&](void* arg) { - IteratorWrapper* it = reinterpret_cast(arg); + IteratorWrapper* it = static_cast(arg); if (it->key().starts_with("z")) { internal_iter2_->Add("x", kTypeValue, "7", 16u, true); internal_iter2_->Add("x", kTypeValue, "7", 15u, true); @@ -2923,7 +2984,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) { // Seek() and before calling Prev() ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "MergeIterator::Prev:BeforePrev", [&](void* arg) { - IteratorWrapper* it = reinterpret_cast(arg); + IteratorWrapper* it = static_cast(arg); if (it->key().starts_with("z")) { internal_iter2_->Add("x", kTypeValue, "7", 16u, true); } @@ -2972,7 +3033,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) { // Seek() and before calling Prev() ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "MergeIterator::Prev:BeforePrev", [&](void* arg) { - IteratorWrapper* it = reinterpret_cast(arg); + IteratorWrapper* it = static_cast(arg); if (it->key().starts_with("z")) { internal_iter2_->Add("x", kTypeValue, "7", 16u, true); internal_iter2_->Add("x", kTypeValue, "7", 15u, true); @@ -3025,7 +3086,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) { // before calling Prev() ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "MergeIterator::Prev:BeforePrev", [&](void* arg) { - IteratorWrapper* it = reinterpret_cast(arg); + IteratorWrapper* it = static_cast(arg); if (it->key().starts_with("z")) { internal_iter2_->Add("x", kTypeValue, "7", 16u, true); internal_iter2_->Add("y", kTypeValue, "7", 17u, true); diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index a29aab6d141..dfbdf5ceae2 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -86,15 +86,15 @@ TEST_F(DBIteratorBaseTest, APICallsWithPerfContext) { class DBIteratorTest : public DBIteratorBaseTest, public testing::WithParamInterface { public: - DBIteratorTest() {} + DBIteratorTest() = default; Iterator* NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family = nullptr) { if (column_family == nullptr) { column_family = db_->DefaultColumnFamily(); } - auto* cfd = - static_cast_with_check(column_family)->cfd(); + auto* cfh = static_cast_with_check(column_family); + auto* cfd = cfh->cfd(); SequenceNumber seq = read_options.snapshot != nullptr ? read_options.snapshot->GetSequenceNumber() : db_->GetLatestSequenceNumber(); @@ -109,7 +109,7 @@ class DBIteratorTest : public DBIteratorBaseTest, } DBImpl* db_impl = dbfull(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl); - return db_impl->NewIteratorImpl(read_options, cfd, super_version, seq, + return db_impl->NewIteratorImpl(read_options, cfh, super_version, seq, read_callback); } @@ -142,6 +142,13 @@ TEST_P(DBIteratorTest, IteratorProperty) { // Get internal key at which the iteration stopped (tombstone in this case). ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); ASSERT_EQ("2", prop_value); + + prop_value.clear(); + ASSERT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop_value)); + uint64_t write_time; + Slice prop_slice = prop_value; + ASSERT_TRUE(GetFixed64(&prop_slice, &write_time)); + ASSERT_EQ(std::numeric_limits::max(), write_time); } Close(); } @@ -2544,7 +2551,7 @@ TEST_P(DBIteratorTest, RefreshWithSnapshot) { TEST_P(DBIteratorTest, CreationFailure) { SyncPoint::GetInstance()->SetCallBack( "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) { - *(reinterpret_cast(arg)) = Status::Corruption("test status"); + *(static_cast(arg)) = Status::Corruption("test status"); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -3225,13 +3232,13 @@ TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { ASSERT_OK(Put("bar", "v7")); SequenceNumber seq2 = db_->GetLatestSequenceNumber(); - auto* cfd = - static_cast_with_check(db_->DefaultColumnFamily()) - ->cfd(); + auto* cfh = static_cast_with_check( + db_->DefaultColumnFamily()); + auto* cfd = cfh->cfd(); // The iterator are suppose to see data before seq1. DBImpl* db_impl = dbfull(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl); - Iterator* iter = db_impl->NewIteratorImpl(ReadOptions(), cfd, super_version, + Iterator* iter = db_impl->NewIteratorImpl(ReadOptions(), cfh, super_version, seq2, &callback1); // Seek @@ -3311,7 +3318,7 @@ TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { // The iterator is suppose to see data before seq3. super_version = cfd->GetReferencedSuperVersion(db_impl); - iter = db_impl->NewIteratorImpl(ReadOptions(), cfd, super_version, seq4, + iter = db_impl->NewIteratorImpl(ReadOptions(), cfh, super_version, seq4, &callback2); // Seek to "z", which is visible. iter->Seek("z"); @@ -3467,8 +3474,7 @@ TEST_F(DBIteratorTest, ErrorWhenReadFile) { SyncPoint::GetInstance()->SetCallBack( "RandomAccessFileReader::Read::BeforeReturn", [&error_file](void* io_s_ptr) { - auto p = - reinterpret_cast*>(io_s_ptr); + auto p = static_cast*>(io_s_ptr); if (p->first->find(error_file) != std::string::npos) { *p->second = IOStatus::IOError(); p->second->SetRetryable(true); @@ -3548,8 +3554,7 @@ TEST_F(DBIteratorTest, ErrorWhenReadFile) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "RandomAccessFileReader::Read::AnyOffset", [&f1](void* pair_ptr) { - auto p = - reinterpret_cast*>(pair_ptr); + auto p = static_cast*>(pair_ptr); if (p->first->find(f1) != std::string::npos) { *p->second = IOStatus::IOError(); p->second->SetRetryable(true); diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc index 614399243e5..d3108e73529 100644 --- a/db/db_kv_checksum_test.cc +++ b/db/db_kv_checksum_test.cc @@ -437,14 +437,13 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) { // This callback should only be called by the leader thread SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) { - auto* leader = reinterpret_cast(arg_leader); + auto* leader = static_cast(arg_leader); ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER); // This callback should only be called by the follower thread SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) { - auto* follower = - reinterpret_cast(arg_follower); + auto* follower = static_cast(arg_follower); // The leader thread will wait on this bool and hence wait until // this writer joins the write group ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER); @@ -549,14 +548,13 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) { // This callback should only be called by the leader thread SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) { - auto* leader = reinterpret_cast(arg_leader); + auto* leader = static_cast(arg_leader); ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER); // This callback should only be called by the follower thread SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) { - auto* follower = - reinterpret_cast(arg_follower); + auto* follower = static_cast(arg_follower); // The leader thread will wait on this bool and hence wait until // this writer joins the write group ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER); @@ -658,11 +656,11 @@ TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) { Options options = CurrentOptions(); Reopen(options); ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); - std::string content = ""; + std::string content; SyncPoint::GetInstance()->SetCallBack( "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", [&](void* batch_ptr) { - WriteBatch* batch = reinterpret_cast(batch_ptr); + WriteBatch* batch = static_cast(batch_ptr); content.assign(batch->Data().data(), batch->GetDataSize()); Slice batch_content = batch->Data(); // Corrupt first bit @@ -672,7 +670,7 @@ TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) { "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum", [&](void* checksum_ptr) { // Verify that checksum is produced on the batch content - uint64_t checksum = *reinterpret_cast(checksum_ptr); + uint64_t checksum = *static_cast(checksum_ptr); ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size())); }); SyncPoint::GetInstance()->EnableProcessing(); diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index 87313971a51..ae37e7bb6e9 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -14,6 +14,7 @@ #include "db/db_test_util.h" #include "env/mock_env.h" #include "port/stack_trace.h" +#include "util/atomic.h" namespace ROCKSDB_NAMESPACE { @@ -147,6 +148,7 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { } TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckWhenArchive) { + RelaxedAtomic callback_hit{}; do { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); Options options = OptionsForLogIterTest(); @@ -168,17 +170,23 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckWhenArchive) { ASSERT_OK(dbfull()->Put(WriteOptions(), "key4", DummyString(1024))); ASSERT_OK(dbfull()->Flush(FlushOptions())); + callback_hit.StoreRelaxed(false); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WalManager::PurgeObsoleteFiles:1", [&](void*) { auto iter = OpenTransactionLogIter(0); ExpectRecords(4, iter); + callback_hit.StoreRelaxed(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->Flush(FlushOptions(), cf)); - delete cf; + // Normally hit several times; WART: perhaps more in parallel after flush + // FIXME: this test is flaky + // ASSERT_TRUE(callback_hit.LoadRelaxed()); } while (ChangeCompactOptions()); + Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } #endif @@ -236,7 +244,7 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) { ASSERT_OK(test::TruncateFile(env_, logfile_path, wal_files.front()->SizeFileBytes() / 2)); - ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); + ASSERT_OK(db_->EnableFileDeletions()); // Insert a new entry to a new log file ASSERT_OK(Put("key1025", DummyString(10))); diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index cae592db365..385ccb43c75 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -121,7 +121,7 @@ class TestPrefixExtractor : public SliceTransform { private: const char* separator(const Slice& key) const { - return reinterpret_cast(memchr(key.data(), '_', key.size())); + return static_cast(memchr(key.data(), '_', key.size())); } }; @@ -287,7 +287,7 @@ TEST_F(DBMemTableTest, InsertWithHint) { options.env = env_; Reopen(options); MockMemTableRep* rep = - reinterpret_cast(options.memtable_factory.get()) + static_cast(options.memtable_factory.get()) ->rep(); ASSERT_OK(Put("foo_k1", "foo_v1")); ASSERT_EQ(nullptr, rep->last_hint_in()); diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index b6b9ff2afe1..e246c127889 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -430,6 +430,111 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) { } } +TEST_F(DBMergeOperandTest, GetMergeOperandsShortCircuitInMemtable) { + const int kNumOperands = 10; + const int kNumOperandsToFetch = 5; + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + Random rnd(301); + std::vector expected_merge_operands; + expected_merge_operands.reserve(kNumOperands); + for (int i = 0; i < kNumOperands; ++i) { + expected_merge_operands.emplace_back(rnd.RandomString(7 /* len */)); + ASSERT_OK(Merge("key", expected_merge_operands.back())); + } + + std::vector merge_operands(kNumOperands); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = kNumOperands; + int num_fetched = 0; + merge_operands_info.continue_cb = [&](Slice /* value */) { + num_fetched++; + return num_fetched != kNumOperandsToFetch; + }; + int num_merge_operands = 0; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "key", merge_operands.data(), + &merge_operands_info, &num_merge_operands)); + ASSERT_EQ(kNumOperandsToFetch, num_merge_operands); + ASSERT_EQ(kNumOperandsToFetch, num_fetched); + + for (int i = 0; i < kNumOperandsToFetch; ++i) { + ASSERT_EQ(expected_merge_operands[kNumOperands - kNumOperandsToFetch + i], + merge_operands[i]); + } +} + +TEST_F(DBMergeOperandTest, GetMergeOperandsShortCircuitBaseValue) { + // The continuation callback doesn't need to be called on a base value because + // there's no remaining work to be saved. + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + Random rnd(301); + std::string expected_value = rnd.RandomString(7 /* len */); + ASSERT_OK(Put("key", expected_value)); + + std::vector merge_operands(1); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = 1; + int num_fetched = 0; + merge_operands_info.continue_cb = [&num_fetched](Slice /* value */) { + num_fetched++; + return true; + }; + int num_merge_operands = 0; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "key", merge_operands.data(), + &merge_operands_info, &num_merge_operands)); + ASSERT_EQ(1, num_merge_operands); + ASSERT_EQ(0, num_fetched); + + ASSERT_EQ(expected_value, merge_operands[0]); +} + +TEST_F(DBMergeOperandTest, GetMergeOperandsShortCircuitInSst) { + const int kNumOperands = 10; + const int kNumOperandsToFetch = 5; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + Random rnd(301); + std::vector expected_merge_operands; + expected_merge_operands.reserve(kNumOperands); + for (int i = 0; i < kNumOperands; ++i) { + expected_merge_operands.emplace_back(rnd.RandomString(7 /* len */)); + ASSERT_OK(Merge("key", expected_merge_operands.back())); + ASSERT_OK(Flush()); + } + + std::vector merge_operands(kNumOperands); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = kNumOperands; + int num_fetched = 0; + merge_operands_info.continue_cb = [&](Slice /* value */) { + num_fetched++; + return num_fetched != kNumOperandsToFetch; + }; + int num_merge_operands = 0; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "key", merge_operands.data(), + &merge_operands_info, &num_merge_operands)); + ASSERT_EQ(kNumOperandsToFetch, num_merge_operands); + ASSERT_EQ(kNumOperandsToFetch, num_fetched); + + for (int i = 0; i < kNumOperandsToFetch; ++i) { + ASSERT_EQ(expected_merge_operands[kNumOperands - kNumOperandsToFetch + i], + merge_operands[i]); + } +} + TEST_F(DBMergeOperandTest, GetMergeOperandsBaseDeletionInImmMem) { // In this test, "k1" has a MERGE in a mutable memtable on top of a base // DELETE in an immutable memtable. diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 8f60f0051b9..1be7a5064b7 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -1146,11 +1146,11 @@ TEST_F(DBOptionsTest, OffpeakTimes) { "1:0000000000000-2:000000000042", // Weird, but we can parse the int. }; - for (std::string invalid_case : invalid_cases) { + for (const std::string& invalid_case : invalid_cases) { options.daily_offpeak_time_utc = invalid_case; verify_invalid(); } - for (std::string valid_case : valid_cases) { + for (const std::string& valid_case : valid_cases) { options.daily_offpeak_time_utc = valid_case; verify_valid(); } @@ -1390,7 +1390,7 @@ TEST_F(DBOptionsTest, ChangeCompression) { bool compacted = false; SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* c = reinterpret_cast(arg); + Compaction* c = static_cast(arg); compression_used = c->output_compression(); compression_opt_used = c->output_compression_opts(); compacted = true; diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index e761f96d9ce..d8e8a2272d3 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -7,9 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include - #include +#include #include #include "db/db_test_util.h" @@ -107,12 +106,18 @@ TEST_F(DBPropertiesTest, Empty) { dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); ASSERT_EQ("0", num); - ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); + ASSERT_OK(db_->EnableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); ASSERT_EQ("0", num); - ASSERT_OK(db_->EnableFileDeletions(/*force=*/true)); + ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + // File deletion enabled after `EnableFileDeletions` called as many times + // as `DisableFileDeletions`. + ASSERT_OK(db_->EnableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); ASSERT_EQ("1", num); @@ -1744,7 +1749,7 @@ TEST_F(DBPropertiesTest, SstFilesSize) { ASSERT_EQ(obsolete_sst_size, sst_size); // Let the obsolete files be deleted. - ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); + ASSERT_OK(db_->EnableFileDeletions()); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kObsoleteSstFilesSize, &obsolete_sst_size)); ASSERT_EQ(obsolete_sst_size, 0); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 003117eec9a..f92fa27aed5 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -22,7 +22,7 @@ class DBRangeDelTest : public DBTestBase { uint64_t uint64_key = static_cast(key); std::string str; str.resize(8); - memcpy(&str[0], static_cast(&uint64_key), 8); + memcpy(str.data(), static_cast(&uint64_key), 8); return str; } }; @@ -1036,9 +1036,6 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); // max file size could be 2x of target file size, so set it to half of that options.target_file_size_base = kValueBytes / 2; - // disable dynamic_file_size, as it will cut L1 files into more files (than - // kNumFilesPerLevel). - options.level_compaction_dynamic_file_size = false; options.max_compaction_bytes = 1500; // i == 0: CompactFiles // i == 1: CompactRange @@ -1107,14 +1104,9 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { options.level0_file_num_compaction_trigger = kNumFilesPerLevel; options.memtable_factory.reset( test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); - options.target_file_size_base = kValueBytes; + // Compaction can generate files of size at most 2 * target_file_size_base. + options.target_file_size_base = kValueBytes / 2; options.disable_auto_compactions = true; - // disable it for now, otherwise the L1 files are going be cut before data 1: - // L1: [0] [1,4] - // L2: [0,0] - // because the grandparent file is between [0]->[1] and it's size is more than - // 1/8 of target size (4k). - options.level_compaction_dynamic_file_size = false; DestroyAndReopen(options); @@ -1154,6 +1146,13 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // [key000000#1,1, key000000#1,1] MoveFilesToLevel(1); ASSERT_EQ(2, NumTableFilesAtLevel(1)); + std::vector> files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + ASSERT_EQ( + files[1][0].largest.Encode(), + InternalKey(Key(2), kMaxSequenceNumber, kTypeRangeDeletion).Encode()); + ASSERT_EQ(files[1][1].smallest.Encode(), + InternalKey(Key(2), 6, kTypeValue).Encode()); { // Compact the second sstable in L1: @@ -1172,6 +1171,12 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); ASSERT_EQ(value, Get(Key(2))); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + ASSERT_EQ(files[2][1].smallest.Encode(), + InternalKey(Key(2), 6, kTypeValue).Encode()); + ASSERT_EQ( + files[2][1].largest.Encode(), + InternalKey(Key(4), kMaxSequenceNumber, kTypeRangeDeletion).Encode()); } { @@ -1190,6 +1195,20 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(3, NumTableFilesAtLevel(2)); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + ASSERT_EQ( + files[2][0].largest.Encode(), + InternalKey(Key(1), kMaxSequenceNumber, kTypeRangeDeletion).Encode()); + ASSERT_EQ(files[2][1].smallest.Encode(), + InternalKey(Key(1), 5, kTypeValue).Encode()); + ASSERT_EQ( + files[2][1].largest.Encode(), + InternalKey(Key(2), kMaxSequenceNumber, kTypeRangeDeletion).Encode()); + ASSERT_EQ(files[2][2].smallest.Encode(), + InternalKey(Key(2), 6, kTypeValue).Encode()); + ASSERT_EQ( + files[2][2].largest.Encode(), + InternalKey(Key(4), kMaxSequenceNumber, kTypeRangeDeletion).Encode()); } db_->ReleaseSnapshot(snapshot); @@ -2309,13 +2328,13 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) { // Test set up: // L1_0: 1, 3, [4, 7) L1_1: 6, 8, [4, 7) // L2: 5 + // L1_0's largest key: Key(6)@kMaxSequenceNumber with type kTypeRangeDeletion // Note that [4, 7) is at end of L1_0 and not overlapping with any point key - // in L1_0. [4, 7) from L1_0 should cover 5 is sentinel works + // in L1_0. [4, 7) from L1_0 should cover 5 if sentinel in LevelIterator works Options options = CurrentOptions(); options.compression = kNoCompression; options.disable_auto_compactions = true; - options.target_file_size_base = 2 * 1024; - options.level_compaction_dynamic_file_size = false; + options.target_file_size_base = 4 * 1024; DestroyAndReopen(options); Random rnd(301); @@ -2335,6 +2354,7 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) { ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10))); ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10))); // Prevent keys being compacted away + const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), Key(7))); ASSERT_OK(db_->Flush(FlushOptions())); @@ -2342,6 +2362,11 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) { MoveFilesToLevel(1); ASSERT_EQ(2, NumTableFilesAtLevel(1)); + std::vector> files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + InternalKey ik = InternalKey(Key(6), kMaxSequenceNumber, kTypeRangeDeletion); + ASSERT_EQ(files[1][0].largest.Encode(), ik.Encode()); + auto iter = db_->NewIterator(ReadOptions()); iter->Seek(Key(3)); ASSERT_TRUE(iter->Valid()); @@ -2361,6 +2386,7 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) { ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1); } delete iter; + db_->ReleaseSnapshot(snapshot); } TEST_F(DBRangeDelTest, OlderLevelHasNewerData) { @@ -2807,8 +2833,8 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) { Arena arena; InternalKeyComparator icmp(options.comparator); ReadOptions read_options; - ScopedArenaIterator iter; - iter.set( + ScopedArenaPtr iter; + iter.reset( dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber)); auto k = Key(4); @@ -3126,14 +3152,18 @@ TEST_F(DBRangeDelTest, RangetombesoneCompensateFilesizePersistDuringReopen) { TEST_F(DBRangeDelTest, SingleKeyFile) { // Test for a bug fix where a range tombstone could be added // to an SST file while is not within the file's key range. - // Create 3 files in L0 and then L1 where all keys have the same user key - // `Key(2)`. The middle file will contain Key(2)@6 and Key(2)@5. Before fix, - // the range tombstone [Key(2), Key(5))@2 would be added to this file during - // compaction, but it is not in this file's key range. + // Create 3 files in L0 and then compact them to L1 where all keys have the + // same user key `Key(2)`. + // L0_0: Key(2)@5 + // L0_1: Key(2)@4 + // L0_2: Key(2)@3, range tombstone [Key(2), Key(5))@2 + // + // After compaction, the first output file contains Key(2)@5 and Key(2)@4. + // Before fix, the range tombstone [Key(2), Key(5))@2 would be added to this + // file during compaction, but it is not in this file's key range. Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.target_file_size_base = 1 << 10; - opts.level_compaction_dynamic_file_size = false; DestroyAndReopen(opts); // prevent range tombstone drop @@ -3178,6 +3208,12 @@ TEST_F(DBRangeDelTest, SingleKeyFile) { std::numeric_limits::max() /*max_file_num_to_ignore*/, "" /*trim_ts*/)); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + std::vector> files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + ASSERT_EQ(files[1][0].largest.Encode(), + InternalKey(Key(2), 4, kTypeValue).Encode()); + for (const auto s : snapshots) { db_->ReleaseSnapshot(s); } @@ -3248,13 +3284,12 @@ TEST_F(DBRangeDelTest, AddRangeDelsSameLowerAndUpperBound) { Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.target_file_size_base = 1 << 10; - opts.level_compaction_dynamic_file_size = false; DestroyAndReopen(opts); Random rnd(301); // Create file at bottommost level so the manual compaction below is - // non-bottommost level and goes through code path like compensate range - // tombstone size. + // non-bottommost level and goes through code path in + // versions->ApproximateSize() to calculate compensated range tombstone size ASSERT_OK(Put(Key(1), "v1")); ASSERT_OK(Put(Key(4), "v2")); ASSERT_OK(Flush()); @@ -3277,6 +3312,12 @@ TEST_F(DBRangeDelTest, AddRangeDelsSameLowerAndUpperBound) { // File 1: Key(1)@1, Key(3)@6, DeleteRange ends at Key(3)@6 // File 2: Key(3)@4, Key(4)@7, DeleteRange start from Key(3)@4 ASSERT_EQ(NumTableFilesAtLevel(1), 2); + std::vector> files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); + ASSERT_EQ(files[1][0].largest.Encode(), + InternalKey(Key(3), 6, kTypeValue).Encode()); + ASSERT_EQ(files[1][1].smallest.Encode(), + InternalKey(Key(3), 4, kTypeValue).Encode()); // Manually update compaction output file cutting decisions // to cut before range tombstone sentinel Key(3)@4 @@ -3325,7 +3366,6 @@ TEST_F(DBRangeDelTest, AddRangeDelsSingleUserKeyTombstoneOnlyFile) { Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.target_file_size_base = 1 << 10; - opts.level_compaction_dynamic_file_size = false; DestroyAndReopen(opts); Random rnd(301); @@ -3486,7 +3526,6 @@ TEST_F(DBRangeDelTest, NonBottommostCompactionDropRangetombstone) { TEST_F(DBRangeDelTest, MemtableMaxRangeDeletions) { // Tests option `memtable_max_range_deletions`. Options options = CurrentOptions(); - options.level_compaction_dynamic_file_size = false; options.memtable_max_range_deletions = 50; options.level0_file_num_compaction_trigger = 5; DestroyAndReopen(options); @@ -3566,8 +3605,7 @@ TEST_F(DBRangeDelTest, RangeDelReseekAfterFileReadError) { SyncPoint::GetInstance()->SetCallBack( "RandomAccessFileReader::Read::BeforeReturn", [&fname](void* pair_ptr) { - auto p = - reinterpret_cast*>(pair_ptr); + auto p = static_cast*>(pair_ptr); if (p->first->find(fname) != std::string::npos) { *p->second = IOStatus::IOError(); p->second->SetRetryable(true); @@ -3627,8 +3665,7 @@ TEST_F(DBRangeDelTest, RangeDelReseekAfterFileReadError) { SyncPoint::GetInstance()->SetCallBack( "RandomAccessFileReader::Read::AnyOffset", [&fname](void* pair_ptr) { - auto p = - reinterpret_cast*>(pair_ptr); + auto p = static_cast*>(pair_ptr); if (p->first->find(fname) != std::string::npos) { *p->second = IOStatus::IOError(); p->second->SetRetryable(true); diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 987756906e2..6c33d41dfe5 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -103,7 +103,7 @@ void DBSecondaryTestBase::CheckFileTypeCounts(const std::string& dir, ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; - for (auto file : filenames) { + for (const auto& file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { @@ -131,7 +131,7 @@ TEST_F(DBSecondaryTest, FailOpenIfLoggerCreationFail) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) { - auto* s = reinterpret_cast(arg); + auto* s = static_cast(arg); assert(s); *s = Status::IOError("Injected"); }); @@ -1191,7 +1191,7 @@ TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { ASSERT_NE(nullptr, arg); called = true; - auto* s = reinterpret_cast(arg); + auto* s = static_cast(arg); ASSERT_NOK(*s); }); SyncPoint::GetInstance()->LoadDependency( @@ -1229,8 +1229,7 @@ TEST_F(DBSecondaryTest, StartFromInconsistent) { SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { ASSERT_NE(nullptr, arg); - *(reinterpret_cast(arg)) = - Status::Corruption("Inject corruption"); + *(static_cast(arg)) = Status::Corruption("Inject corruption"); }); SyncPoint::GetInstance()->EnableProcessing(); Options options1; @@ -1263,8 +1262,7 @@ TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) { SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { ASSERT_NE(nullptr, arg); - *(reinterpret_cast(arg)) = - Status::Corruption("Inject corruption"); + *(static_cast(arg)) = Status::Corruption("Inject corruption"); }); SyncPoint::GetInstance()->EnableProcessing(); Status s = db_secondary_->TryCatchUpWithPrimary(); diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 7590aa2f113..d7eee829bee 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -27,8 +27,8 @@ class DBSSTTest : public DBTestBase { // A class which remembers the name of each flushed file. class FlushedFileCollector : public EventListener { public: - FlushedFileCollector() {} - ~FlushedFileCollector() override {} + FlushedFileCollector() = default; + ~FlushedFileCollector() override = default; void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard lock(mutex_); @@ -38,7 +38,7 @@ class FlushedFileCollector : public EventListener { std::vector GetFlushedFiles() { std::lock_guard lock(mutex_); std::vector result; - for (auto fname : flushed_files_) { + for (const auto& fname : flushed_files_) { result.push_back(fname); } return result; @@ -661,7 +661,7 @@ class DBSSTTestRateLimit : public DBSSTTest, public ::testing::WithParamInterface { public: DBSSTTestRateLimit() : DBSSTTest() {} - ~DBSSTTestRateLimit() override {} + ~DBSSTTestRateLimit() override = default; }; TEST_P(DBSSTTestRateLimit, RateLimitedDelete) { @@ -957,15 +957,18 @@ TEST_F(DBSSTTest, OpenDBWithExistingTrashAndObsoleteSstFile) { // Add some trash files to the db directory so the DB can clean them up ASSERT_OK(env_->CreateDirIfMissing(dbname_)); - ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); - ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); - ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); + ASSERT_OK( + WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash", false)); + ASSERT_OK( + WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash", false)); + ASSERT_OK( + WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash", false)); // Manually add an obsolete sst file. Obsolete SST files are discovered and // deleted upon recovery. constexpr uint64_t kSstFileNumber = 100; const std::string kObsoleteSstFile = MakeTableFileName(dbname_, kSstFileNumber); - ASSERT_OK(WriteStringToFile(env_, "abc", kObsoleteSstFile)); + ASSERT_OK(WriteStringToFile(env_, "abc", kObsoleteSstFile, false)); // Reopen the DB and verify that it deletes existing trash files and obsolete // SST files with rate limiting. @@ -1090,10 +1093,10 @@ TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) { int num_wal_files = 0; std::vector db_files; ASSERT_OK(env_->GetChildren(dbname_, &db_files)); - for (std::string f : db_files) { - if (f.substr(f.find_last_of(".") + 1) == "sst") { + for (const std::string& f : db_files) { + if (f.substr(f.find_last_of('.') + 1) == "sst") { num_sst_files++; - } else if (f.substr(f.find_last_of(".") + 1) == "log") { + } else if (f.substr(f.find_last_of('.') + 1) == "log") { num_wal_files++; } } @@ -1676,7 +1679,7 @@ TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) { bool is_get_file_size_called = false; SyncPoint::GetInstance()->SetCallBack( "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) { - std::string* filename = reinterpret_cast(arg); + std::string* filename = static_cast(arg); if (filename->find(".blob") != std::string::npos) { is_get_file_size_called = true; } diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index 054fbc56c72..f430811d3e8 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -6,9 +6,11 @@ #include #include "db/db_test_util.h" +#include "db/write_batch_internal.h" #include "monitoring/thread_status_util.h" #include "port/stack_trace.h" #include "rocksdb/statistics.h" +#include "rocksdb/utilities/transaction_db.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -283,6 +285,78 @@ TEST_F(DBStatisticsTest, BlockChecksumStats) { options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); } +TEST_F(DBStatisticsTest, BytesWrittenStats) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers); + Reopen(options); + + EXPECT_EQ(0, options.statistics->getAndResetTickerCount(WAL_FILE_BYTES)); + EXPECT_EQ(0, options.statistics->getAndResetTickerCount(BYTES_WRITTEN)); + + const int kNumKeysWritten = 100; + + // Scenario 0: Not using transactions. + // This will write to WAL and memtable directly. + ASSERT_OK(options.statistics->Reset()); + + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + + EXPECT_EQ(options.statistics->getAndResetTickerCount(WAL_FILE_BYTES), + options.statistics->getAndResetTickerCount(BYTES_WRITTEN)); + + // Scenario 1: Using transactions. + // This should not double count BYTES_WRITTEN (issue #12061). + for (bool enable_pipelined_write : {false, true}) { + ASSERT_OK(options.statistics->Reset()); + + // Destroy the DB to recreate as a TransactionDB. + Destroy(options, true); + + // Create a TransactionDB. + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + options.enable_pipelined_write = enable_pipelined_write; + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); + ASSERT_NE(txn_db, nullptr); + db_ = txn_db->GetBaseDB(); + + WriteOptions wopts; + TransactionOptions txn_opts; + Transaction* txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + ASSERT_NE(txn, nullptr); + ASSERT_OK(txn->SetName("txn1")); + + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK(txn->Put(Key(i), "val")); + } + + // Prepare() writes to WAL, but not to memtable. (WriteCommitted) + ASSERT_OK(txn->Prepare()); + EXPECT_NE(0, options.statistics->getTickerCount(WAL_FILE_BYTES)); + // BYTES_WRITTEN would have been non-zero previously (issue #12061). + EXPECT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN)); + + // Commit() writes to memtable and also a commit marker to WAL. + ASSERT_OK(txn->Commit()); + delete txn; + + // The WAL has an extra header of size `kHeader` written to it, + // as we are writing twice to it (first during Prepare, second during + // Commit). + EXPECT_EQ(options.statistics->getAndResetTickerCount(WAL_FILE_BYTES), + options.statistics->getAndResetTickerCount(BYTES_WRITTEN) + + WriteBatchInternal::kHeader); + + // Cleanup + db_ = nullptr; + delete txn_db; + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 61dcf3c1e1d..dca36721f89 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -22,6 +22,7 @@ #include "table/table_properties_internal.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/atomic.h" #include "util/random.h" @@ -58,9 +59,6 @@ class DBTablePropertiesTest : public DBTestBase, public: DBTablePropertiesTest() : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {} - TablePropertiesCollection TestGetPropertiesOfTablesInRange( - std::vector ranges, std::size_t* num_properties = nullptr, - std::size_t* num_files = nullptr); }; TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { @@ -76,8 +74,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { if (table == 3) { SyncPoint::GetInstance()->SetCallBack( "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) { - *reinterpret_cast(meta) = - &kPropertiesBlockOldName; + *static_cast(meta) = &kPropertiesBlockOldName; }); SyncPoint::GetInstance()->EnableProcessing(); } @@ -95,7 +92,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { // Part of strategy to prevent pinning table files SyncPoint::GetInstance()->SetCallBack( "VersionEditHandler::LoadTables:skip_load_table_files", - [&](void* skip_load) { *reinterpret_cast(skip_load) = true; }); + [&](void* skip_load) { *static_cast(skip_load) = true; }); SyncPoint::GetInstance()->EnableProcessing(); // 1. Read table properties directly from file @@ -180,9 +177,7 @@ TEST_F(DBTablePropertiesTest, InvalidIgnored) { // Inject properties block data that Block considers invalid SyncPoint::GetInstance()->SetCallBack( "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", - [&](void* block_data) { - *reinterpret_cast(block_data) = Slice("X"); - }); + [&](void* block_data) { *static_cast(block_data) = Slice("X"); }); SyncPoint::GetInstance()->EnableProcessing(); // Corrupting the table properties corrupts the unique id. @@ -235,49 +230,109 @@ TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) { ASSERT_EQ(0.5, del_factory->GetDeletionRatio()); } -TablePropertiesCollection -DBTablePropertiesTest::TestGetPropertiesOfTablesInRange( - std::vector ranges, std::size_t* num_properties, - std::size_t* num_files) { - // Since we deref zero element in the vector it can not be empty - // otherwise we pass an address to some random memory - EXPECT_GT(ranges.size(), 0U); - // run the query - TablePropertiesCollection props; - EXPECT_OK(db_->GetPropertiesOfTablesInRange( - db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props)); - - // Make sure that we've received properties for those and for those files - // only which fall within requested ranges - std::vector vmd; - db_->GetLiveFilesMetaData(&vmd); - for (auto& md : vmd) { - std::string fn = md.db_path + md.name; - bool in_range = false; - for (auto& r : ranges) { - // smallestkey < limit && largestkey >= start - if (r.limit.compare(md.smallestkey) >= 0 && - r.start.compare(md.largestkey) <= 0) { - in_range = true; - EXPECT_GT(props.count(fn), 0); - } +// Test params: +// 1) whether to enable user-defined timestamps +class DBTablePropertiesInRangeTest : public DBTestBase, + public testing::WithParamInterface { + public: + DBTablePropertiesInRangeTest() + : DBTestBase("db_table_properties_in_range_test", + /*env_do_fsync=*/false) {} + + void SetUp() override { enable_udt_ = GetParam(); } + + protected: + void PutKeyValue(const Slice& key, const Slice& value) { + if (enable_udt_) { + EXPECT_OK(db_->Put(WriteOptions(), key, min_ts_, value)); + } else { + EXPECT_OK(Put(key, value)); } - if (!in_range) { - EXPECT_EQ(props.count(fn), 0); + } + + std::string GetValue(const std::string& key) { + ReadOptions roptions; + std::string result; + if (enable_udt_) { + roptions.timestamp = &min_ts_; } + Status s = db_->Get(roptions, key, &result); + EXPECT_TRUE(s.ok()); + return result; } - if (num_properties) { - *num_properties = props.size(); + Status MaybeGetValue(const std::string& key, std::string* result) { + ReadOptions roptions; + if (enable_udt_) { + roptions.timestamp = &min_ts_; + } + Status s = db_->Get(roptions, key, result); + EXPECT_TRUE(s.IsNotFound() || s.ok()); + return s; } - if (num_files) { - *num_files = vmd.size(); + TablePropertiesCollection TestGetPropertiesOfTablesInRange( + std::vector ranges, std::size_t* num_properties = nullptr, + std::size_t* num_files = nullptr) { + // Since we deref zero element in the vector it can not be empty + // otherwise we pass an address to some random memory + EXPECT_GT(ranges.size(), 0U); + // run the query + TablePropertiesCollection props; + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + EXPECT_OK(db_->GetPropertiesOfTablesInRange(default_cf, ranges.data(), + ranges.size(), &props)); + + const Comparator* ucmp = default_cf->GetComparator(); + EXPECT_NE(ucmp, nullptr); + const size_t ts_sz = ucmp->timestamp_size(); + const size_t range_size = ranges.size(); + autovector ukey_ranges; + std::vector keys; + ukey_ranges.reserve(range_size); + keys.reserve(range_size * 2); + for (auto& r : ranges) { + auto [start, limit] = MaybeAddTimestampsToRange( + &r.start, &r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(), + /*exclusive_end=*/false); + EXPECT_TRUE(start.has_value()); + EXPECT_TRUE(limit.has_value()); + ukey_ranges.emplace_back(start.value(), limit.value()); + } + // Make sure that we've received properties for those and for those files + // only which fall within requested ranges + std::vector vmd; + db_->GetLiveFilesMetaData(&vmd); + for (auto& md : vmd) { + std::string fn = md.db_path + md.name; + bool in_range = false; + for (auto& r : ukey_ranges) { + if (ucmp->Compare(r.start, md.largestkey) <= 0 && + ucmp->Compare(r.limit, md.smallestkey) >= 0) { + in_range = true; + EXPECT_GT(props.count(fn), 0); + } + } + if (!in_range) { + EXPECT_EQ(props.count(fn), 0); + } + } + + if (num_properties) { + *num_properties = props.size(); + } + + if (num_files) { + *num_files = vmd.size(); + } + return props; } - return props; -} -TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { + bool enable_udt_ = false; + Slice min_ts_ = MinU64Ts(); +}; + +TEST_P(DBTablePropertiesInRangeTest, GetPropertiesOfTablesInRange) { // Fixed random sead Random rnd(301); @@ -295,17 +350,21 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { options.hard_pending_compaction_bytes_limit = 16 * 1024; options.num_levels = 8; options.env = env_; + bool udt_enabled = GetParam(); + if (udt_enabled) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } DestroyAndReopen(options); // build a decent LSM for (int i = 0; i < 10000; i++) { - ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + PutKeyValue(test::RandomKey(&rnd, 5), rnd.RandomString(102)); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { - ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + PutKeyValue(test::RandomKey(&rnd, 5), rnd.RandomString(102)); ASSERT_OK(Flush()); } @@ -362,7 +421,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { std::vector ranges; auto it = random_keys.begin(); while (it != random_keys.end()) { - ranges.push_back(Range(*it, *(it + 1))); + ranges.emplace_back(*it, *(it + 1)); it += 2; } @@ -370,6 +429,10 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { } } +INSTANTIATE_TEST_CASE_P(DBTablePropertiesInRangeTest, + DBTablePropertiesInRangeTest, + ::testing::Values(true, false)); + TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { std::string kExtraCfName = "pikachu"; CreateAndReopenWithCF({kExtraCfName}, CurrentOptions()); @@ -417,6 +480,71 @@ TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) { } } +TEST_F(DBTablePropertiesTest, FactoryReturnsNull) { + struct JunkTablePropertiesCollector : public TablePropertiesCollector { + const char* Name() const override { return "JunkTablePropertiesCollector"; } + Status Finish(UserCollectedProperties* properties) override { + properties->insert({"Junk", "Junk"}); + return Status::OK(); + } + UserCollectedProperties GetReadableProperties() const override { + return {}; + } + }; + + // Alternates between putting a "Junk" property and using `nullptr` to + // opt out. + static RelaxedAtomic count{0}; + struct SometimesTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + const char* Name() const override { + return "SometimesTablePropertiesCollectorFactory"; + } + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + if (count.FetchAddRelaxed(1) & 1) { + return nullptr; + } else { + return new JunkTablePropertiesCollector(); + } + } + }; + + Options options = CurrentOptions(); + options.table_properties_collector_factories.emplace_back( + std::make_shared()); + // For plain table + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + for (const std::shared_ptr& tf : + {options.table_factory, + std::shared_ptr(NewPlainTableFactory({}))}) { + SCOPED_TRACE("Table factory = " + std::string(tf->Name())); + options.table_factory = tf; + + DestroyAndReopen(options); + + ASSERT_OK(Put("key0", "value1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key0", "value2")); + ASSERT_OK(Flush()); + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + int no_junk_count = 0; + int junk_count = 0; + for (const auto& item : props) { + if (item.second->user_collected_properties.find("Junk") != + item.second->user_collected_properties.end()) { + junk_count++; + } else { + no_junk_count++; + } + } + EXPECT_EQ(1, no_junk_count); + EXPECT_EQ(1, junk_count); + } +} + class DBTableHostnamePropertyTest : public DBTestBase, public ::testing::WithParamInterface> { diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index 07ffadc2af2..7c326519e61 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -203,13 +203,13 @@ TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { bool file_iters_renewed_copy = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "ForwardIterator::SeekInternal:Return", [&](void* arg) { - ForwardIterator* fiter = reinterpret_cast(arg); + ForwardIterator* fiter = static_cast(arg); ASSERT_TRUE(!file_iters_deleted || fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "ForwardIterator::Next:Return", [&](void* arg) { - ForwardIterator* fiter = reinterpret_cast(arg); + ForwardIterator* fiter = static_cast(arg); ASSERT_TRUE(!file_iters_deleted || fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); }); @@ -360,8 +360,9 @@ TEST_P(DBTestTailingIterator, TailingIteratorDeletes) { // make sure we can read all new records using the existing iterator int count = 0; - for (; iter->Valid(); iter->Next(), ++count) + for (; iter->Valid(); iter->Next(), ++count) { ; + } ASSERT_OK(iter->status()); ASSERT_EQ(count, num_records); } diff --git a/db/db_test.cc b/db/db_test.cc index c0173b1fc9f..105afd4fd3e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -59,7 +59,6 @@ #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "table/mock_table.h" -#include "table/scoped_arena_iterator.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -679,8 +678,8 @@ TEST_F(DBTest, ReadFromPersistedTier) { multiget_cfs.push_back(handles_[1]); multiget_cfs.push_back(handles_[1]); std::vector multiget_keys; - multiget_keys.push_back("foo"); - multiget_keys.push_back("bar"); + multiget_keys.emplace_back("foo"); + multiget_keys.emplace_back("bar"); std::vector multiget_values; for (int i = 0; i < 2; i++) { bool batched = i == 0; @@ -715,7 +714,7 @@ TEST_F(DBTest, ReadFromPersistedTier) { // Expect same result in multiget multiget_cfs.push_back(handles_[1]); - multiget_keys.push_back("rocksdb"); + multiget_keys.emplace_back("rocksdb"); multiget_values.clear(); for (int i = 0; i < 2; i++) { @@ -2519,7 +2518,7 @@ TEST_F(DBTest, SnapshotFiles) { } // release file snapshot - ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false)); + ASSERT_OK(dbfull()->EnableFileDeletions()); // overwrite one key, this key should not appear in the snapshot std::vector extras; for (unsigned int i = 0; i < 1; i++) { @@ -2703,7 +2702,7 @@ TEST_F(DBTest, PurgeInfoLogs) { ASSERT_OK(env_->GetChildren( options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files)); int info_log_count = 0; - for (std::string file : files) { + for (const std::string& file : files) { if (file.find("LOG") != std::string::npos) { info_log_count++; } @@ -2721,7 +2720,7 @@ TEST_F(DBTest, PurgeInfoLogs) { if (mode == 1) { // Cleaning up ASSERT_OK(env_->GetChildren(options.db_log_dir, &files)); - for (std::string file : files) { + for (const std::string& file : files) { ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file)); } ASSERT_OK(env_->DeleteDir(options.db_log_dir)); @@ -2749,7 +2748,7 @@ struct MTThread { }; static void MTThreadBody(void* arg) { - MTThread* t = reinterpret_cast(arg); + MTThread* t = static_cast(arg); int id = t->id; DB* db = t->state->test->db_; int counter = 0; @@ -2881,7 +2880,9 @@ class MultiThreadedDBTest }; TEST_P(MultiThreadedDBTest, MultiThreaded) { - if (option_config_ == kPipelinedWrite) return; + if (option_config_ == kPipelinedWrite) { + return; + } anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); @@ -2932,7 +2933,7 @@ struct GCThread { }; static void GCThreadBody(void* arg) { - GCThread* t = reinterpret_cast(arg); + GCThread* t = static_cast(arg); int id = t->id; DB* db = t->db; WriteOptions wo; @@ -2980,7 +2981,7 @@ TEST_F(DBTest, GroupCommitTest) { Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); - for (auto x : expected_db) { + for (const auto& x : expected_db) { ASSERT_TRUE(itr->Valid()); ASSERT_EQ(itr->key().ToString(), x); ASSERT_EQ(itr->value().ToString(), x); @@ -3098,28 +3099,28 @@ class ModelDB : public DB { } using DB::Get; Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/, - const Slice& key, PinnableSlice* /*value*/) override { + const Slice& key, PinnableSlice* /*value*/, + std::string* /*timestamp*/) override { return Status::NotSupported(key); } using DB::GetMergeOperands; - virtual Status GetMergeOperands( - const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, - const Slice& key, PinnableSlice* /*slice*/, - GetMergeOperandsOptions* /*merge_operands_options*/, - int* /*number_of_operands*/) override { + Status GetMergeOperands(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions* /*merge_operands_options*/, + int* /*number_of_operands*/) override { return Status::NotSupported(key); } using DB::MultiGet; - std::vector MultiGet( - const ReadOptions& /*options*/, - const std::vector& /*column_family*/, - const std::vector& keys, - std::vector* /*values*/) override { - std::vector s(keys.size(), - Status::NotSupported("Not implemented.")); - return s; + void MultiGet(const ReadOptions& /*options*/, const size_t num_keys, + ColumnFamilyHandle** /*column_families*/, const Slice* /*keys*/, + PinnableSlice* /*values*/, std::string* /*timestamps*/, + Status* statuses, const bool /*sorted_input*/) override { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::NotSupported("Not implemented."); + } } using DB::IngestExternalFile; @@ -3137,7 +3138,7 @@ class ModelDB : public DB { } using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& /*options*/, const std::string& /*column_family_name*/, const ImportColumnFamilyOptions& /*import_options*/, @@ -3152,9 +3153,9 @@ class ModelDB : public DB { } using DB::ClipColumnFamily; - virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, - const Slice& /*begin*/, - const Slice& /*end*/) override { + Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { return Status::NotSupported("Not implemented."); } @@ -3190,7 +3191,7 @@ class ModelDB : public DB { return new ModelIter(saved, true); } else { const KVMap* snapshot_state = - &(reinterpret_cast(options.snapshot)->map_); + &(static_cast(options.snapshot)->map_); return new ModelIter(snapshot_state, false); } } @@ -3200,12 +3201,20 @@ class ModelDB : public DB { return Status::NotSupported("Not supported yet"); } + SequenceNumber GetIteratorSequenceNumber(Iterator*) override { // No support yet. assert(false); return 0; } + // UNDER CONSTRUCTION - DO NOT USE + std::unique_ptr NewMultiCfIterator( + const ReadOptions& /*options*/, + const std::vector& /*column_families*/) override { + return nullptr; + } + const Snapshot* GetSnapshot() override { ModelSnapshot* snapshot = new ModelSnapshot; snapshot->map_ = map_; @@ -3218,7 +3227,7 @@ class ModelDB : public DB { } void ReleaseSnapshot(const Snapshot* snapshot) override { - delete reinterpret_cast(snapshot); + delete static_cast(snapshot); } Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override { @@ -3316,11 +3325,11 @@ class ModelDB : public DB { return Status::NotSupported("Not supported operation."); } - void EnableManualCompaction() override { return; } + void EnableManualCompaction() override {} - void DisableManualCompaction() override { return; } + void DisableManualCompaction() override {} - virtual Status WaitForCompact( + Status WaitForCompact( const WaitForCompactOptions& /* wait_for_compact_options */) override { return Status::OK(); } @@ -3366,7 +3375,7 @@ class ModelDB : public DB { Status DisableFileDeletions() override { return Status::OK(); } - Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } + Status EnableFileDeletions() override { return Status::OK(); } Status GetLiveFiles(std::vector&, uint64_t* /*size*/, bool /*flush_memtable*/ = true) override { @@ -3393,8 +3402,7 @@ class ModelDB : public DB { return Status::OK(); } - virtual Status GetCreationTimeOfOldestFile( - uint64_t* /*creation_time*/) override { + Status GetCreationTimeOfOldestFile(uint64_t* /*creation_time*/) override { return Status::NotSupported(); } @@ -3464,7 +3472,9 @@ class ModelDB : public DB { ModelIter(const KVMap* map, bool owned) : map_(map), owned_(owned), iter_(map_->end()) {} ~ModelIter() override { - if (owned_) delete map_; + if (owned_) { + delete map_; + } } bool Valid() const override { return iter_ != map_->end(); } void SeekToFirst() override { iter_ = map_->begin(); } @@ -3502,7 +3512,7 @@ class ModelDB : public DB { }; const Options options_; KVMap map_; - std::string name_ = ""; + std::string name_; }; #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) @@ -3650,8 +3660,12 @@ TEST_P(DBTestRandomized, Randomized) { // Save a snapshot from each DB this time that we'll use next // time we compare things, to make sure the current state is // preserved with the snapshot - if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); - if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + if (model_snap != nullptr) { + model.ReleaseSnapshot(model_snap); + } + if (db_snap != nullptr) { + db_->ReleaseSnapshot(db_snap); + } Reopen(options); ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); @@ -3660,8 +3674,12 @@ TEST_P(DBTestRandomized, Randomized) { db_snap = db_->GetSnapshot(); } } - if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); - if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + if (model_snap != nullptr) { + model.ReleaseSnapshot(model_snap); + } + if (db_snap != nullptr) { + db_->ReleaseSnapshot(db_snap); + } } #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) @@ -4290,9 +4308,9 @@ TEST_F(DBTest, DISABLED_RateLimitingTest) { // (e.g, RateLimiter::GetTotalPendingRequests()) class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter { public: - MockedRateLimiterWithNoOptionalAPIImpl() {} + MockedRateLimiterWithNoOptionalAPIImpl() = default; - ~MockedRateLimiterWithNoOptionalAPIImpl() override {} + ~MockedRateLimiterWithNoOptionalAPIImpl() override = default; void SetBytesPerSecond(int64_t bytes_per_second) override { (void)bytes_per_second; @@ -4694,11 +4712,18 @@ void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, int op_count = 0; std::vector thread_list; ASSERT_OK(env->GetThreadList(&thread_list)); - for (auto thread : thread_list) { + for (const auto& thread : thread_list) { if (thread.operation_type == op_type) { op_count++; } } + if (op_count != expected_count) { + for (const auto& thread : thread_list) { + fprintf(stderr, "thread id: %" PRIu64 ", thread status: %s\n", + thread.thread_id, + thread.GetOperationName(thread.operation_type).c_str()); + } + } ASSERT_EQ(op_count, expected_count); } } // anonymous namespace @@ -4734,7 +4759,7 @@ TEST_F(DBTest, GetThreadStatus) { s = env_->GetThreadList(&thread_list); ASSERT_OK(s); memset(thread_type_counts, 0, sizeof(thread_type_counts)); - for (auto thread : thread_list) { + for (const auto& thread : thread_list) { ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); thread_type_counts[thread.thread_type]++; } @@ -5015,7 +5040,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { } ASSERT_OK(env_->GetThreadList(&thread_list)); - for (auto thread : thread_list) { + for (const auto& thread : thread_list) { operation_count[thread.operation_type]++; } @@ -5040,7 +5065,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { operation_count[i] = 0; } ASSERT_OK(env_->GetThreadList(&thread_list)); - for (auto thread : thread_list) { + for (const auto& thread : thread_list) { operation_count[thread.operation_type]++; } ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); @@ -5102,7 +5127,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { } ASSERT_OK(env_->GetThreadList(&thread_list)); - for (auto thread : thread_list) { + for (const auto& thread : thread_list) { operation_count[thread.operation_type]++; } @@ -5127,7 +5152,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { operation_count[i] = 0; } ASSERT_OK(env_->GetThreadList(&thread_list)); - for (auto thread : thread_list) { + for (const auto& thread : thread_list) { operation_count[thread.operation_type]++; } ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); @@ -5212,7 +5237,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) { })); ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); - for (auto file : cf_meta.levels[4].files) { + for (const auto& file : cf_meta.levels[4].files) { listener->SetExpectedFileName(dbname_ + file.name); ASSERT_OK(dbfull()->DeleteFile(file.name)); } @@ -5270,7 +5295,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { std::atomic num_no(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); if (compaction->output_level() == 4) { ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); num_lz4.fetch_add(1); @@ -5278,7 +5303,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = reinterpret_cast(arg); + auto* compression = static_cast(arg); ASSERT_TRUE(*compression == kNoCompression); num_no.fetch_add(1); }); @@ -5312,7 +5337,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { num_no.store(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); if (compaction->output_level() == 4 && compaction->start_level() == 3) { ASSERT_TRUE(compaction->output_compression() == kZlibCompression); num_zlib.fetch_add(1); @@ -5323,7 +5348,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = reinterpret_cast(arg); + auto* compression = static_cast(arg); ASSERT_TRUE(*compression == kNoCompression); num_no.fetch_add(1); }); @@ -5710,7 +5735,7 @@ TEST_F(DBTest, FileCreationRandomFailure) { std::vector values; for (int i = 0; i < kTestSize; ++i) { - values.push_back("NOT_FOUND"); + values.emplace_back("NOT_FOUND"); } for (int j = 0; j < kTotalIteration; ++j) { if (j == kRandomFailureTest) { @@ -5836,13 +5861,6 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); ASSERT_TRUE(mutable_cf_options.report_bg_io_stats); - ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order); - - ASSERT_OK(dbfull()->SetOptions( - handles_[1], {{"check_flush_compaction_key_order", "false"}})); - ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], - &mutable_cf_options)); - ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order); } TEST_F(DBTest, L0L1L2AndUpHitCounter) { @@ -6225,7 +6243,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) { return "CompactionFilterFactoryGetContext"; } static bool IsManual(CompactionFilterFactory* compaction_filter_factory) { - return reinterpret_cast( + return static_cast( compaction_filter_factory) ->saved_context.is_manual_compaction; } @@ -7108,9 +7126,8 @@ TEST_F(DBTest, PinnableSliceAndRowCache) { ASSERT_OK(Flush()); ASSERT_EQ(Get("foo"), "bar"); - ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 1); + ASSERT_EQ(static_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); { PinnableSlice pin_slice; @@ -7118,13 +7135,11 @@ TEST_F(DBTest, PinnableSliceAndRowCache) { ASSERT_EQ(pin_slice.ToString(), "bar"); // Entry is already in cache, lookup will remove the element from lru ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 0); + static_cast(options.row_cache.get())->TEST_GetLRUSize(), 0); } // After PinnableSlice destruction element is added back in LRU - ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 1); + ASSERT_EQ(static_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); } TEST_F(DBTest, ReusePinnableSlice) { @@ -7137,9 +7152,8 @@ TEST_F(DBTest, ReusePinnableSlice) { ASSERT_OK(Flush()); ASSERT_EQ(Get("foo"), "bar"); - ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 1); + ASSERT_EQ(static_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); { PinnableSlice pin_slice; @@ -7149,17 +7163,15 @@ TEST_F(DBTest, ReusePinnableSlice) { // Entry is already in cache, lookup will remove the element from lru ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 0); + static_cast(options.row_cache.get())->TEST_GetLRUSize(), 0); } // After PinnableSlice destruction element is added back in LRU - ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 1); + ASSERT_EQ(static_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); { std::vector multiget_keys; - multiget_keys.push_back("foo"); + multiget_keys.emplace_back("foo"); std::vector multiget_values(1); std::vector statuses({Status::NotFound()}); ReadOptions ropt; @@ -7174,19 +7186,17 @@ TEST_F(DBTest, ReusePinnableSlice) { // Entry is already in cache, lookup will remove the element from lru ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 0); + static_cast(options.row_cache.get())->TEST_GetLRUSize(), 0); } // After PinnableSlice destruction element is added back in LRU - ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 1); + ASSERT_EQ(static_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); { std::vector multiget_cfs; multiget_cfs.push_back(dbfull()->DefaultColumnFamily()); std::vector multiget_keys; - multiget_keys.push_back("foo"); + multiget_keys.emplace_back("foo"); std::vector multiget_values(1); std::vector statuses({Status::NotFound()}); ReadOptions ropt; @@ -7201,13 +7211,11 @@ TEST_F(DBTest, ReusePinnableSlice) { // Entry is already in cache, lookup will remove the element from lru ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 0); + static_cast(options.row_cache.get())->TEST_GetLRUSize(), 0); } // After PinnableSlice destruction element is added back in LRU - ASSERT_EQ( - reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), - 1); + ASSERT_EQ(static_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); } @@ -7366,7 +7374,7 @@ TEST_F(DBTest, CreationTimeOfOldestFile) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { - TableProperties* props = reinterpret_cast(arg); + TableProperties* props = static_cast(arg); if (set_file_creation_time_to_zero) { if (idx == 0) { props->file_creation_time = 0; diff --git a/db/db_test2.cc b/db/db_test2.cc index e471685b210..2da4b563d04 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -80,9 +80,8 @@ TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); - column_families.push_back(ColumnFamilyDescriptor("goku", cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); + column_families.emplace_back("goku", cf_options); std::vector handles; // OpenForReadOnly should fail but will create in the file system ASSERT_NOK( @@ -748,7 +747,7 @@ TEST_F(DBTest2, WalFilterTest) { // we expect all records to be processed for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { - keys_must_exist.push_back(Slice(batch_keys[i][j])); + keys_must_exist.emplace_back(batch_keys[i][j]); } } break; @@ -762,9 +761,9 @@ TEST_F(DBTest2, WalFilterTest) { for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i == apply_option_for_record_index) { - keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + keys_must_not_exist.emplace_back(batch_keys[i][j]); } else { - keys_must_exist.push_back(Slice(batch_keys[i][j])); + keys_must_exist.emplace_back(batch_keys[i][j]); } } } @@ -780,9 +779,9 @@ TEST_F(DBTest2, WalFilterTest) { for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i >= apply_option_for_record_index) { - keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + keys_must_not_exist.emplace_back(batch_keys[i][j]); } else { - keys_must_exist.push_back(Slice(batch_keys[i][j])); + keys_must_exist.emplace_back(batch_keys[i][j]); } } } @@ -922,9 +921,9 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) { - keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + keys_must_not_exist.emplace_back(batch_keys[i][j]); } else { - keys_must_exist.push_back(Slice(batch_keys[i][j])); + keys_must_exist.emplace_back(batch_keys[i][j]); } } } @@ -1012,7 +1011,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { - keys_must_exist.push_back(Slice(batch_keys[i][j])); + keys_must_exist.emplace_back(batch_keys[i][j]); } } @@ -2331,7 +2330,7 @@ TEST_F(DBTest2, MaxCompactionBytesTest) { } static void UniqueIdCallback(void* arg) { - int* result = reinterpret_cast(arg); + int* result = static_cast(arg); if (*result == -1) { *result = 0; } @@ -2350,7 +2349,7 @@ class MockPersistentCache : public PersistentCache { "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); } - ~MockPersistentCache() override {} + ~MockPersistentCache() override = default; PersistentCache::StatsType Stats() override { return PersistentCache::StatsType(); @@ -3036,7 +3035,7 @@ TEST_F(DBTest2, PausingManualCompaction1) { // Remember file name before compaction is triggered std::vector files_meta; dbfull()->GetLiveFilesMetaData(&files_meta); - for (auto file : files_meta) { + for (const auto& file : files_meta) { files_before_compact.push_back(file.name); } @@ -3051,7 +3050,7 @@ TEST_F(DBTest2, PausingManualCompaction1) { // Get file names after compaction is stopped files_meta.clear(); dbfull()->GetLiveFilesMetaData(&files_meta); - for (auto file : files_meta) { + for (const auto& file : files_meta) { files_after_compact.push_back(file.name); } @@ -3071,7 +3070,7 @@ TEST_F(DBTest2, PausingManualCompaction1) { files_meta.clear(); files_after_compact.clear(); dbfull()->GetLiveFilesMetaData(&files_meta); - for (auto file : files_meta) { + for (const auto& file : files_meta) { files_after_compact.push_back(file.name); } @@ -3861,6 +3860,17 @@ TEST_F(DBTest2, LowPriWrite) { int64_t* rate_bytes_per_sec = static_cast(arg); ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec); }); + + // Make a trivial L5 for L0 to compact into. L6 will be large so debt ratio + // will not cause compaction pressure. + Random rnd(301); + ASSERT_OK(Put("", rnd.RandomString(102400))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // Block compaction ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"}, @@ -3937,6 +3947,15 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); + // To precisely control when to start bg compaction for excluding previous + // rate-limited bytes of flush read for table verification + std::shared_ptr sleeping_task( + new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::LOW); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task.get(), Env::Priority::LOW); + sleeping_task->WaitUntilSleeping(); + for (int i = 0; i < kNumL0Files; ++i) { for (int j = 0; j <= kNumKeysPerFile; ++j) { ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey))); @@ -3946,13 +3965,20 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } + + size_t rate_limited_bytes_start_bytes = + options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL); + + sleeping_task->WakeUp(); + sleeping_task->WaitUntilDone(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - // should be slightly above 512KB due to non-data blocks read. Arbitrarily // chose 1MB as the upper bound on the total bytes read. - size_t rate_limited_bytes = static_cast( - options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL)); + size_t rate_limited_bytes = + static_cast( + options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL)) - + rate_limited_bytes_start_bytes; // The charges can exist for `IO_LOW` and `IO_USER` priorities. size_t rate_limited_bytes_by_pri = options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) + @@ -4135,7 +4161,7 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber()))); } - ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); + ASSERT_OK(db_->EnableFileDeletions()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -4213,10 +4239,10 @@ TEST_F(DBTest2, TestNumPread) { class TraceExecutionResultHandler : public TraceRecordResult::Handler { public: - TraceExecutionResultHandler() {} - ~TraceExecutionResultHandler() override {} + TraceExecutionResultHandler() = default; + ~TraceExecutionResultHandler() override = default; - virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override { + Status Handle(const StatusOnlyTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } @@ -4234,8 +4260,7 @@ class TraceExecutionResultHandler : public TraceRecordResult::Handler { return Status::OK(); } - virtual Status Handle( - const SingleValueTraceExecutionResult& result) override { + Status Handle(const SingleValueTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } @@ -4253,8 +4278,7 @@ class TraceExecutionResultHandler : public TraceRecordResult::Handler { return Status::OK(); } - virtual Status Handle( - const MultiValuesTraceExecutionResult& result) override { + Status Handle(const MultiValuesTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } @@ -4274,7 +4298,7 @@ class TraceExecutionResultHandler : public TraceRecordResult::Handler { return Status::OK(); } - virtual Status Handle(const IteratorTraceExecutionResult& result) override { + Status Handle(const IteratorTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } @@ -4399,9 +4423,8 @@ TEST_F(DBTest2, TraceAndReplay) { std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); - column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); - column_families.push_back( - ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + column_families.emplace_back("default", cf_options); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; @@ -4591,9 +4614,8 @@ TEST_F(DBTest2, TraceAndManualReplay) { std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); - column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); - column_families.push_back( - ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + column_families.emplace_back("default", cf_options); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; @@ -4868,9 +4890,8 @@ TEST_F(DBTest2, TraceWithLimit) { std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); - column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); - column_families.push_back( - ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + column_families.emplace_back("default", cf_options); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; @@ -4942,9 +4963,8 @@ TEST_F(DBTest2, TraceWithSampling) { DB* db2 = nullptr; std::vector column_families; ColumnFamilyOptions cf_options; - column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); - column_families.push_back( - ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + column_families.emplace_back("default", cf_options); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; @@ -5048,9 +5068,8 @@ TEST_F(DBTest2, TraceWithFilter) { std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); - column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); - column_families.push_back( - ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + column_families.emplace_back("default", cf_options); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; @@ -5098,9 +5117,8 @@ TEST_F(DBTest2, TraceWithFilter) { delete db3_init; column_families.clear(); - column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); - column_families.push_back( - ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + column_families.emplace_back("default", cf_options); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); handles.clear(); DB* db3 = nullptr; @@ -5691,7 +5709,7 @@ TEST_F(DBTest2, CrashInRecoveryMultipleCF) { ASSERT_OK(ReadFileToString(env_, fname, &file_content)); file_content[400] = 'h'; file_content[401] = 'a'; - ASSERT_OK(WriteStringToFile(env_, file_content, fname)); + ASSERT_OK(WriteStringToFile(env_, file_content, fname, false)); break; } } @@ -6441,7 +6459,7 @@ class RenameCurrentTest : public DBTestBase, : DBTestBase("rename_current_test", /*env_do_fsync=*/true), sync_point_(GetParam()) {} - ~RenameCurrentTest() override {} + ~RenameCurrentTest() override = default; void SetUp() override { env_->no_file_overwrite_.store(true, std::memory_order_release); @@ -6454,7 +6472,7 @@ class RenameCurrentTest : public DBTestBase, void SetupSyncPoints() { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) { - Status* s = reinterpret_cast(arg); + Status* s = static_cast(arg); assert(s); *s = Status::IOError("Injected IO error."); }); @@ -6586,7 +6604,7 @@ TEST_F(DBTest2, LastLevelTemperature) { auto* listener = new TestListener(); Options options = CurrentOptions(); - options.bottommost_temperature = Temperature::kWarm; + options.last_level_temperature = Temperature::kWarm; options.level0_file_num_compaction_trigger = 2; options.level_compaction_dynamic_level_bytes = true; options.num_levels = kNumLevels; @@ -6798,8 +6816,8 @@ TEST_F(DBTest2, LastLevelTemperatureUniversal) { size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); - // Update bottommost temperature - options.bottommost_temperature = Temperature::kWarm; + // Update last level temperature + options.last_level_temperature = Temperature::kWarm; Reopen(options); db_->GetColumnFamilyMetaData(&metadata); // Should not impact existing ones @@ -6851,10 +6869,10 @@ TEST_F(DBTest2, LastLevelTemperatureUniversal) { &prop)); ASSERT_EQ(std::atoi(prop.c_str()), 0); - // Update bottommost temperature dynamically with SetOptions + // Update last level temperature dynamically with SetOptions auto s = db_->SetOptions({{"last_level_temperature", "kCold"}}); ASSERT_OK(s); - ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold); + ASSERT_EQ(db_->GetOptions().last_level_temperature, Temperature::kCold); db_->GetColumnFamilyMetaData(&metadata); // Should not impact the existing files ASSERT_EQ(Temperature::kWarm, @@ -6880,94 +6898,133 @@ TEST_F(DBTest2, LastLevelTemperatureUniversal) { ASSERT_GT(size, 0); // kLastTemperature is an invalid temperature - options.bottommost_temperature = Temperature::kLastTemperature; + options.last_level_temperature = Temperature::kLastTemperature; s = TryReopen(options); ASSERT_TRUE(s.IsIOError()); } TEST_F(DBTest2, LastLevelStatistics) { - Options options = CurrentOptions(); - options.bottommost_temperature = Temperature::kWarm; - options.default_temperature = Temperature::kHot; - options.level0_file_num_compaction_trigger = 2; - options.level_compaction_dynamic_level_bytes = true; - options.statistics = CreateDBStatistics(); - Reopen(options); - - // generate 1 sst on level 0 - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("bar", "bar")); - ASSERT_OK(Flush()); - ASSERT_EQ("bar", Get("bar")); - - ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0); - ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0); - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0); - - // 2nd flush to trigger compaction - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("bar", "bar")); - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ("bar", Get("bar")); - - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); - - auto pre_bytes = - options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES); - auto pre_count = - options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); - - // 3rd flush to generate 1 sst on level 0 - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("bar", "bar")); - ASSERT_OK(Flush()); - ASSERT_EQ("bar", Get("bar")); + for (bool write_time_default : {false, true}) { + SCOPED_TRACE("write time default? " + std::to_string(write_time_default)); + Options options = CurrentOptions(); + options.last_level_temperature = Temperature::kWarm; + if (write_time_default) { + options.default_write_temperature = Temperature::kHot; + ASSERT_EQ(options.default_temperature, Temperature::kUnknown); + } else { + options.default_temperature = Temperature::kHot; + ASSERT_EQ(options.default_write_temperature, Temperature::kUnknown); + } + options.level0_file_num_compaction_trigger = 2; + options.level_compaction_dynamic_level_bytes = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), - pre_bytes); - ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), - pre_count); - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); - - // Not a realistic setting to make last level kWarm and default temp kCold. - // This is just for testing default temp can be reset on reopen while the - // last level temp is consistent across DB reopen because those file's temp - // are persisted in manifest. - options.default_temperature = Temperature::kCold; - ASSERT_OK(options.statistics->Reset()); - Reopen(options); - ASSERT_EQ("bar", Get("bar")); + DestroyAndReopen(options); - ASSERT_EQ(0, options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + // generate 1 sst on level 0 + ASSERT_OK(Put("foo1", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0); + + // 2nd flush to trigger compaction + ASSERT_OK(Put("foo2", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); + + auto pre_bytes = + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES); + auto pre_count = + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + + // 3rd flush to generate 1 sst on level 0 + ASSERT_OK(Put("foo3", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("foo1")); + ASSERT_EQ("bar", Get("foo2")); + ASSERT_EQ("bar", Get("foo3")); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + pre_bytes); + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + pre_count); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); + // Control + ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT)); + + // Not a realistic setting to make last level kWarm and default temp kCold. + // This is just for testing default temp can be reset on reopen while the + // last level temp is consistent across DB reopen because those file's temp + // are persisted in manifest. + options.default_temperature = Temperature::kCold; + ASSERT_OK(options.statistics->Reset()); + Reopen(options); + ASSERT_EQ("bar", Get("foo1")); + ASSERT_EQ("bar", Get("foo2")); + ASSERT_EQ("bar", Get("foo3")); + ASSERT_EQ("bar", Get("bar")); + + if (write_time_default) { + // Unchanged + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); + + ASSERT_LT(0, options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(0, options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); + } else { + // Changed (in how we map kUnknown) + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(COLD_FILE_READ_COUNT)); + + ASSERT_EQ(0, options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_LT(0, options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); + } - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(COLD_FILE_READ_COUNT)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), - options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); - ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), - options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); + // Control + ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT)); + } } TEST_F(DBTest2, CheckpointFileTemperature) { @@ -6984,7 +7041,7 @@ TEST_F(DBTest2, CheckpointFileTemperature) { auto test_fs = std::make_shared(env_->GetFileSystem()); std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); Options options = CurrentOptions(); - options.bottommost_temperature = Temperature::kWarm; + options.last_level_temperature = Temperature::kWarm; // set dynamic_level to true so the compaction would compact the data to the // last level directly which will have the last_level_temperature options.level_compaction_dynamic_level_bytes = true; @@ -7010,7 +7067,7 @@ TEST_F(DBTest2, CheckpointFileTemperature) { std::vector infos; ASSERT_OK( dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos)); - for (auto info : infos) { + for (const auto& info : infos) { temperatures.emplace(info.file_number, info.temperature); } @@ -7043,7 +7100,7 @@ TEST_F(DBTest2, FileTemperatureManifestFixup) { auto test_fs = std::make_shared(env_->GetFileSystem()); std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); Options options = CurrentOptions(); - options.bottommost_temperature = Temperature::kWarm; + options.last_level_temperature = Temperature::kWarm; // set dynamic_level to true so the compaction would compact the data to the // last level directly which will have the last_level_temperature options.level_compaction_dynamic_level_bytes = true; @@ -7156,7 +7213,7 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { "LogReader::ReadMore:AfterReadFile", [&](void* arg) { if (should_inject_error) { ASSERT_NE(nullptr, arg); - *reinterpret_cast(arg) = Status::IOError("Injected IOError"); + *static_cast(arg) = Status::IOError("Injected IOError"); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -7751,6 +7808,27 @@ TEST_F(DBTest2, ZSTDChecksum) { } #endif +TEST_F(DBTest2, TableCacheMissDuringReadFromBlockCacheTier) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + // Give table cache zero capacity to prevent preloading tables. That way, + // `kBlockCacheTier` reads will fail due to table cache misses. + dbfull()->TEST_table_cache()->SetCapacity(0); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + uint64_t orig_num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS); + + ReadOptions non_blocking_opts; + non_blocking_opts.read_tier = kBlockCacheTier; + std::string value; + ASSERT_TRUE(db_->Get(non_blocking_opts, "foo", &value).IsIncomplete()); + + ASSERT_EQ(orig_num_file_opens, TestGetTickerCount(options, NO_FILE_OPENS)); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 3fb45767630..cbc394b0426 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -73,7 +73,7 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) if (getenv("ENCRYPTED_ENV")) { std::shared_ptr provider; std::string provider_id = getenv("ENCRYPTED_ENV"); - if (provider_id.find("=") == std::string::npos && + if (provider_id.find('=') == std::string::npos && !EndsWith(provider_id, "://test")) { provider_id = provider_id + "://test"; } @@ -588,7 +588,7 @@ void DBTestBase::CreateColumnFamilies(const std::vector& cfs, ColumnFamilyOptions cf_opts(options); size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); - for (auto cf : cfs) { + for (const auto& cf : cfs) { Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]); ASSERT_OK(s); } @@ -651,7 +651,7 @@ Status DBTestBase::TryReopenWithColumnFamilies( EXPECT_EQ(cfs.size(), options.size()); std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + column_families.emplace_back(cfs[i], options[i]); } DBOptions db_opts = DBOptions(options[0]); last_options_ = options[0]; @@ -759,6 +759,24 @@ Status DBTestBase::Put(int cf, const Slice& k, const Slice& v, } } +Status DBTestBase::TimedPut(const Slice& k, const Slice& v, + uint64_t write_unix_time, WriteOptions wo) { + return TimedPut(0, k, v, write_unix_time, wo); +} + +Status DBTestBase::TimedPut(int cf, const Slice& k, const Slice& v, + uint64_t write_unix_time, WriteOptions wo) { + WriteBatch wb; + ColumnFamilyHandle* cfh; + if (cf != 0) { + cfh = handles_[cf]; + } else { + cfh = db_->DefaultColumnFamily(); + } + EXPECT_OK(wb.TimedPut(cfh, k, v, write_unix_time)); + return db_->Write(wo, &wb); +} + Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) { return db_->Merge(wo, k, v); } @@ -828,7 +846,7 @@ std::vector DBTestBase::MultiGet(std::vector cfs, for (unsigned int i = 0; i < cfs.size(); ++i) { handles.push_back(handles_[cfs[i]]); - keys.push_back(k[i]); + keys.emplace_back(k[i]); } std::vector s; if (!batched) { @@ -875,7 +893,7 @@ std::vector DBTestBase::MultiGet(const std::vector& k, std::vector pin_values(k.size()); for (size_t i = 0; i < k.size(); ++i) { - keys.push_back(k[i]); + keys.emplace_back(k[i]); } db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), pin_values.data(), statuses.data()); @@ -974,13 +992,13 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { auto options = CurrentOptions(); InternalKeyComparator icmp(options.comparator); ReadOptions read_options; - ScopedArenaIterator iter; + ScopedArenaPtr iter; if (cf == 0) { - iter.set(dbfull()->NewInternalIterator(read_options, &arena, - kMaxSequenceNumber)); + iter.reset(dbfull()->NewInternalIterator(read_options, &arena, + kMaxSequenceNumber)); } else { - iter.set(dbfull()->NewInternalIterator(read_options, &arena, - kMaxSequenceNumber, handles_[cf])); + iter.reset(dbfull()->NewInternalIterator(read_options, &arena, + kMaxSequenceNumber, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -1152,7 +1170,7 @@ int DBTestBase::TotalTableFiles(int cf, int levels) { // Return spread of files per level std::string DBTestBase::FilesPerLevel(int cf) { int num_levels = - (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[cf]); std::string result; size_t last_non_zero_offset = 0; for (int level = 0; level < num_levels; level++) { @@ -1453,13 +1471,13 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { auto options = CurrentOptions(); InternalKeyComparator icmp(options.comparator); ReadOptions read_options; - ScopedArenaIterator iter; + ScopedArenaPtr iter; if (cf != 0) { - iter.set(dbfull()->NewInternalIterator(read_options, &arena, - kMaxSequenceNumber, handles_[cf])); + iter.reset(dbfull()->NewInternalIterator(read_options, &arena, + kMaxSequenceNumber, handles_[cf])); } else { - iter.set(dbfull()->NewInternalIterator(read_options, &arena, - kMaxSequenceNumber)); + iter.reset(dbfull()->NewInternalIterator(read_options, &arena, + kMaxSequenceNumber)); } iter->SeekToFirst(); ASSERT_OK(iter->status()); @@ -1614,7 +1632,7 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, << iter_cnt << " / " << true_data.size(); // Verify Iterator::Seek() - for (auto kv : true_data) { + for (const auto& kv : true_data) { iter->Seek(kv.first); ASSERT_EQ(kv.first, iter->key().ToString()); ASSERT_EQ(kv.second, iter->value().ToString()); @@ -1644,7 +1662,7 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, << iter_cnt << " / " << true_data.size(); // Verify ForwardIterator::Seek() - for (auto kv : true_data) { + for (const auto& kv : true_data) { iter->Seek(kv.first); ASSERT_EQ(kv.first, iter->key().ToString()); ASSERT_EQ(kv.second, iter->value().ToString()); @@ -1667,7 +1685,7 @@ void DBTestBase::VerifyDBInternal( auto iter = dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber); iter->SeekToFirst(); - for (auto p : true_data) { + for (const auto& p : true_data) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey ikey; ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); diff --git a/db/db_test_util.h b/db/db_test_util.h index 023784f6152..775c161d36b 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -24,6 +24,7 @@ #include "db/db_impl/db_impl.h" #include "file/filename.h" +#include "options/options_helper.h" #include "rocksdb/advanced_options.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -40,7 +41,6 @@ #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" #include "table/mock_table.h" -#include "table/scoped_arena_iterator.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "util/cast_util.h" @@ -438,15 +438,15 @@ class SpecialEnv : public EnvWrapper { : target_(std::move(target)), counter_(counter), bytes_read_(bytes_read) {} - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { counter_->Increment(); Status s = target_->Read(offset, n, result, scratch); *bytes_read_ += result->size(); return s; } - virtual Status Prefetch(uint64_t offset, size_t n) override { + Status Prefetch(uint64_t offset, size_t n) override { Status s = target_->Prefetch(offset, n); *bytes_read_ += n; return s; @@ -465,8 +465,8 @@ class SpecialEnv : public EnvWrapper { : target_(std::move(target)), fail_cnt_(failure_cnt), fail_odd_(fail_odd) {} - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { if (Random::GetTLSInstance()->OneIn(fail_odd_)) { fail_cnt_->fetch_add(1); return Status::IOError("random error"); @@ -474,7 +474,7 @@ class SpecialEnv : public EnvWrapper { return target_->Read(offset, n, result, scratch); } - virtual Status Prefetch(uint64_t offset, size_t n) override { + Status Prefetch(uint64_t offset, size_t n) override { return target_->Prefetch(offset, n); } @@ -502,19 +502,19 @@ class SpecialEnv : public EnvWrapper { return s; } - virtual Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& soptions) override { + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { class CountingFile : public SequentialFile { public: CountingFile(std::unique_ptr&& target, anon::AtomicCounter* counter) : target_(std::move(target)), counter_(counter) {} - virtual Status Read(size_t n, Slice* result, char* scratch) override { + Status Read(size_t n, Slice* result, char* scratch) override { counter_->Increment(); return target_->Read(n, result, scratch); } - virtual Status Skip(uint64_t n) override { return target_->Skip(n); } + Status Skip(uint64_t n) override { return target_->Skip(n); } private: std::unique_ptr target_; @@ -528,7 +528,7 @@ class SpecialEnv : public EnvWrapper { return s; } - virtual void SleepForMicroseconds(int micros) override { + void SleepForMicroseconds(int micros) override { sleep_counter_.Increment(); if (no_slowdown_ || time_elapse_only_sleep_) { addon_microseconds_.fetch_add(micros); @@ -550,7 +550,7 @@ class SpecialEnv : public EnvWrapper { addon_microseconds_.fetch_add(seconds * 1000000); } - virtual Status GetCurrentTime(int64_t* unix_time) override { + Status GetCurrentTime(int64_t* unix_time) override { Status s; if (time_elapse_only_sleep_) { *unix_time = maybe_starting_time_; @@ -564,22 +564,22 @@ class SpecialEnv : public EnvWrapper { return s; } - virtual uint64_t NowCPUNanos() override { + uint64_t NowCPUNanos() override { now_cpu_count_.fetch_add(1); return target()->NowCPUNanos(); } - virtual uint64_t NowNanos() override { + uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + addon_microseconds_.load() * 1000; } - virtual uint64_t NowMicros() override { + uint64_t NowMicros() override { return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) + addon_microseconds_.load(); } - virtual Status DeleteFile(const std::string& fname) override { + Status DeleteFile(const std::string& fname) override { delete_count_.fetch_add(1); return target()->DeleteFile(fname); } @@ -729,7 +729,11 @@ class FileTemperatureTestFS : public FileSystemWrapper { if (e != current_sst_file_temperatures_.end() && e->second != opts.temperature) { result->reset(); - return IOStatus::PathNotFound("Temperature mismatch on " + fname); + return IOStatus::PathNotFound( + "Read requested temperature " + + temperature_to_string[opts.temperature] + + " but stored with temperature " + + temperature_to_string[e->second] + " for " + fname); } } *result = WrapWithTemperature( @@ -758,7 +762,11 @@ class FileTemperatureTestFS : public FileSystemWrapper { if (e != current_sst_file_temperatures_.end() && e->second != opts.temperature) { result->reset(); - return IOStatus::PathNotFound("Temperature mismatch on " + fname); + return IOStatus::PathNotFound( + "Read requested temperature " + + temperature_to_string[opts.temperature] + + " but stored with temperature " + + temperature_to_string[e->second] + " for " + fname); } } *result = WrapWithTemperature( @@ -792,11 +800,37 @@ class FileTemperatureTestFS : public FileSystemWrapper { return target()->NewWritableFile(fname, opts, result, dbg); } + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override { + IOStatus ios = target()->DeleteFile(fname, options, dbg); + if (ios.ok()) { + uint64_t number; + FileType type; + if (ParseFileName(GetFileName(fname), &number, &type) && + type == kTableFile) { + MutexLock lock(&mu_); + current_sst_file_temperatures_.erase(number); + } + } + return ios; + } + void CopyCurrentSstFileTemperatures(std::map* out) { MutexLock lock(&mu_); *out = current_sst_file_temperatures_; } + size_t CountCurrentSstFilesWithTemperature(Temperature temp) { + MutexLock lock(&mu_); + size_t count = 0; + for (const auto& e : current_sst_file_temperatures_) { + if (e.second == temp) { + ++count; + } + } + return count; + } + void OverrideSstFileTemperature(uint64_t number, Temperature temp) { MutexLock lock(&mu_); current_sst_file_temperatures_[number] = temp; @@ -882,8 +916,8 @@ class FlushCounterListener : public EventListener { // "corrupted", "corrupted_try_merge", or "corrupted_must_merge". class TestPutOperator : public MergeOperator { public: - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { static const std::map bad_operand_to_op_failure_scope = { {"corrupted", MergeOperator::OpFailureScope::kDefault}, @@ -914,7 +948,7 @@ class TestPutOperator : public MergeOperator { return true; } - virtual const char* Name() const override { return "TestPutOperator"; } + const char* Name() const override { return "TestPutOperator"; } }; /* @@ -1141,6 +1175,12 @@ class DBTestBase : public testing::Test { Status Put(int cf, const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()); + Status TimedPut(const Slice& k, const Slice& v, uint64_t write_unix_time, + WriteOptions wo = WriteOptions()); + + Status TimedPut(int cf, const Slice& k, const Slice& v, + uint64_t write_unix_time, WriteOptions wo = WriteOptions()); + Status Merge(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()); diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 5c10cdaacf4..b6a716356df 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -558,7 +558,7 @@ TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) { ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); std::vector compaction_input_file_names; - for (auto file : cf_meta.levels[0].files) { + for (const auto& file : cf_meta.levels[0].files) { if (rnd.OneIn(2)) { compaction_input_file_names.push_back(file.name); } @@ -2187,7 +2187,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "UniversalCompactionPicker::PickPeriodicCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(arg != nullptr); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kPeriodicCompaction); @@ -2258,7 +2258,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompactionOffpeak) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "UniversalCompactionPicker::PickPeriodicCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); + Compaction* compaction = static_cast(arg); ASSERT_TRUE(arg != nullptr); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kPeriodicCompaction); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index fbc01131e50..91070e298b6 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -107,9 +107,9 @@ class EnrichedSpecialEnv : public SpecialEnv { InstrumentedMutex env_mutex_; // the wal whose actual delete was skipped by the env - std::string skipped_wal = ""; + std::string skipped_wal; // the largest WAL that was requested to be deleted - std::string largest_deleted_wal = ""; + std::string largest_deleted_wal; // number of WALs that were successfully deleted std::atomic deleted_wal_cnt = {0}; // the WAL whose delete from fs was skipped is reopened during recovery @@ -1123,15 +1123,13 @@ TEST_F(DBWALTest, PreallocateBlock) { } #endif // !(defined NDEBUG) || !defined(OS_WIN) -TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) { - // TODO(ajkr): Disabled until WAL recycling is fixed for - // `kPointInTimeRecovery`. - +TEST_F(DBWALTest, FullPurgePreservesRecycledLog) { // For github issue #1303 for (int i = 0; i < 2; ++i) { Options options = CurrentOptions(); options.create_if_missing = true; options.recycle_log_file_num = 2; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; if (i != 0) { options.wal_dir = alternative_wal_dir_; } @@ -1162,16 +1160,14 @@ TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) { } } -TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) { - // TODO(ajkr): Disabled until WAL recycling is fixed for - // `kPointInTimeRecovery`. - +TEST_F(DBWALTest, FullPurgePreservesLogPendingReuse) { // Ensures full purge cannot delete a WAL while it's in the process of being // recycled. In particular, we force the full purge after a file has been // chosen for reuse, but before it has been renamed. for (int i = 0; i < 2; ++i) { Options options = CurrentOptions(); options.recycle_log_file_num = 1; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; if (i != 0) { options.wal_dir = alternative_wal_dir_; } @@ -1195,7 +1191,7 @@ TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) { ROCKSDB_NAMESPACE::port::Thread thread([&]() { TEST_SYNC_POINT( "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"); - ASSERT_OK(db_->EnableFileDeletions(/*force=*/true)); + ASSERT_OK(db_->EnableFileDeletions()); TEST_SYNC_POINT( "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge"); }); @@ -1543,7 +1539,7 @@ class RecoveryTestHelper { /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", options.daily_offpeak_time_utc, - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); wal_manager.reset( new WalManager(db_options, file_options, /*io_tracer=*/nullptr)); @@ -1561,7 +1557,7 @@ class RecoveryTestHelper { new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0, false, db_options.wal_compression); - ASSERT_OK(log_writer->AddCompressionTypeRecord()); + ASSERT_OK(log_writer->AddCompressionTypeRecord(WriteOptions())); current_log_writer.reset(log_writer); WriteBatch batch; @@ -1574,7 +1570,7 @@ class RecoveryTestHelper { ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); ASSERT_OK(current_log_writer->AddRecord( - WriteBatchInternal::Contents(&batch))); + WriteOptions(), WriteBatchInternal::Contents(&batch))); versions->SetLastAllocatedSequence(seq); versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); @@ -2227,8 +2223,7 @@ TEST_P(DBWALTestWithParamsVaryingRecoveryMode, ReadOptions ropt; Iterator* iter = dbfull()->NewIterator(ropt); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - data.push_back( - std::make_pair(iter->key().ToString(), iter->value().ToString())); + data.emplace_back(iter->key().ToString(), iter->value().ToString()); } EXPECT_OK(iter->status()); delete iter; @@ -2434,7 +2429,7 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { std::string last_log; uint64_t last_log_num = 0; ASSERT_OK(env_->GetChildren(dbname_, &filenames)); - for (auto fname : filenames) { + for (const auto& fname : filenames) { uint64_t number; FileType type; if (ParseFileName(fname, &number, &type, nullptr)) { @@ -2452,7 +2447,7 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "PosixWritableFile::Close", - [](void* arg) { *(reinterpret_cast(arg)) = 0; }); + [](void* arg) { *(static_cast(arg)) = 0; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Preallocate space for the empty log file. This could happen if WAL data // was buffered in memory and the process crashed. @@ -2496,7 +2491,7 @@ TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) { SyncPoint::GetInstance()->SetCallBack( "PosixWritableFile::Close", [&](void* arg) { if (!enable_truncate) { - *(reinterpret_cast(arg)) = 0; + *(static_cast(arg)) = 0; } }); SyncPoint::GetInstance()->EnableProcessing(); diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index 4bd8eaa0bfb..2d5b08832ab 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -375,7 +375,7 @@ TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) { ts_low_str_long); ASSERT_EQ(s, Status::InvalidArgument()); // test IncreaseFullHistoryTsLow with a timestamp which is null - std::string ts_low_str_null = ""; + std::string ts_low_str_null; s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str_null); ASSERT_EQ(s, Status::InvalidArgument()); @@ -430,8 +430,8 @@ TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) { std::vector ranges; std::string start_tmp = Key(10); std::string end_tmp = Key(20); - ranges.emplace_back(Range(start_tmp, end_tmp)); - ranges.emplace_back(Range(start, end)); + ranges.emplace_back(start_tmp, end_tmp); + ranges.emplace_back(start, end); uint64_t range_sizes[2]; ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, ranges.data(), 2, range_sizes)); @@ -598,8 +598,7 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) { ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); DBOptions db_options(options); // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND. @@ -642,8 +641,7 @@ TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) { ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); DBOptions db_options(options); // OpenAndTrimHistory should not work with avoid_flush_during_recovery @@ -1646,35 +1644,88 @@ TEST_F(DBBasicTestWithTimestamp, GetWithRowCache) { const Snapshot* snap_with_nothing = db_->GetSnapshot(); ASSERT_OK(db_->Put(write_opts, "foo", ts_early, "bar")); - const Snapshot* snap_with_foo = db_->GetSnapshot(); + ASSERT_OK(db_->Put(write_opts, "foo2", ts_early, "bar2")); + ASSERT_OK(db_->Put(write_opts, "foo3", ts_early, "bar3")); - // Ensure file has sequence number greater than snapshot_with_foo - for (int i = 0; i < 10; i++) { - std::string numStr = std::to_string(i); - ASSERT_OK(db_->Put(write_opts, numStr, ts_later, numStr)); - } + const Snapshot* snap_with_foo = db_->GetSnapshot(); ASSERT_OK(Flush()); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); ReadOptions read_opts; read_opts.timestamp = &ts_later_slice; std::string read_value; std::string read_ts; - Status s = db_->Get(read_opts, "foo", &read_value, &read_ts); - ASSERT_OK(s); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); - ASSERT_EQ(read_ts, ts_early); + Status s; - s = db_->Get(read_opts, "foo", &read_value, &read_ts); - ASSERT_OK(s); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); - // Row cache is not storing the ts when record is inserted/updated. - // To be fixed after enabling ROW_CACHE with timestamp. - // ASSERT_EQ(read_ts, ts_early); + int expected_hit_count = 0; + int expected_miss_count = 0; + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count); + + { + read_opts.timestamp = nullptr; + s = db_->Get(read_opts, "foo", &read_value); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + } + + // Mix use of Get + { + read_opts.timestamp = &ts_later_slice; + + // Use Get without ts first, expect cache entry to store the correct ts + s = db_->Get(read_opts, "foo2", &read_value); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); + ASSERT_EQ(read_value, "bar2"); + + s = db_->Get(read_opts, "foo2", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), ++expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count); + ASSERT_EQ(read_ts, ts_early); + ASSERT_EQ(read_value, "bar2"); + + // Use Get with ts first, expect the Get without ts can get correct record + s = db_->Get(read_opts, "foo3", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); + ASSERT_EQ(read_ts, ts_early); + ASSERT_EQ(read_value, "bar3"); + + s = db_->Get(read_opts, "foo3", &read_value); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), ++expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count); + ASSERT_EQ(read_value, "bar3"); + } + + { + // Test with consecutive calls of Get with ts. + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); + ASSERT_EQ(read_ts, ts_early); + ASSERT_EQ(read_value, "bar"); + + // Test repeated get on cache entry + for (int i = 0; i < 3; i++) { + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), + ++expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + expected_miss_count); + ASSERT_EQ(read_ts, ts_early); + ASSERT_EQ(read_value, "bar"); + } + } { std::string ts_nothing = Timestamp(0, 0); @@ -1682,41 +1733,43 @@ TEST_F(DBBasicTestWithTimestamp, GetWithRowCache) { read_opts.timestamp = &ts_nothing_slice; s = db_->Get(read_opts, "foo", &read_value, &read_ts); ASSERT_TRUE(s.IsNotFound()); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); - - read_opts.timestamp = &ts_later_slice; - s = db_->Get(read_opts, "foo", &read_value, &read_ts); - ASSERT_OK(s); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); } { read_opts.snapshot = snap_with_foo; - + read_opts.timestamp = &ts_later_slice; s = db_->Get(read_opts, "foo", &read_value, &read_ts); ASSERT_OK(s); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 3); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); + ASSERT_EQ(read_ts, ts_early); + ASSERT_EQ(read_value, "bar"); s = db_->Get(read_opts, "foo", &read_value, &read_ts); ASSERT_OK(s); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 3); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), ++expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count); + ASSERT_EQ(read_ts, ts_early); + ASSERT_EQ(read_value, "bar"); } { read_opts.snapshot = snap_with_nothing; s = db_->Get(read_opts, "foo", &read_value, &read_ts); ASSERT_TRUE(s.IsNotFound()); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 4); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); s = db_->Get(read_opts, "foo", &read_value, &read_ts); ASSERT_TRUE(s.IsNotFound()); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); - ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 5); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), + ++expected_miss_count); } db_->ReleaseSnapshot(snap_with_nothing); @@ -1724,6 +1777,65 @@ TEST_F(DBBasicTestWithTimestamp, GetWithRowCache) { Close(); } +TEST_F(DBBasicTestWithTimestamp, GetWithRowCacheMultiSST) { + BlockBasedTableOptions table_options; + table_options.block_size = 1; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + LRUCacheOptions cache_options; + cache_options.capacity = 8192; + options.row_cache = cache_options.MakeSharedRowCache(); + + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + + std::string ts_early = Timestamp(1, 0); + std::string ts_later = Timestamp(10, 0); + Slice ts_later_slice = ts_later; + + ASSERT_OK(db_->Put(WriteOptions(), "foo", ts_early, "v1")); + ASSERT_OK(Flush()); + + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + ASSERT_OK( + db_->Merge(WriteOptions(), default_cf, "foo", Timestamp(2, 0), "v2")); + ASSERT_OK( + db_->Merge(WriteOptions(), default_cf, "foo", Timestamp(3, 0), "v3")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + read_opts.timestamp = &ts_later_slice; + + std::string read_value; + std::string read_ts; + Status s; + + { + // Since there are two SST files, will trigger the table lookup twice. + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); + ASSERT_EQ(read_ts, Timestamp(3, 0)); + ASSERT_EQ(read_value, "v1,v2,v3"); + + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); + ASSERT_EQ(read_ts, Timestamp(3, 0)); + ASSERT_EQ(read_value, "v1,v2,v3"); + } +} + TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) { Options options = CurrentOptions(); options.env = env_; @@ -2498,10 +2610,10 @@ TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency({ - {"DBImpl::MultiGet:AfterGetSeqNum1", + {"DBImpl::MultiCFSnapshot:AfterGetSeqNum1", "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"}, {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut", - "DBImpl::MultiGet:AfterGetSeqNum2"}, + "DBImpl::MultiCFSnapshot:AfterGetSeqNum2"}, }); SyncPoint::GetInstance()->EnableProcessing(); port::Thread writer_thread([this]() { @@ -2520,7 +2632,7 @@ TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) { auto ss = db_->MultiGet(read_opts, keys, &values); writer_thread.join(); - for (auto s : ss) { + for (const auto& s : ss) { ASSERT_TRUE(s.IsNotFound()); } VerifyDefaultCF(); @@ -2790,8 +2902,8 @@ TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) { // A class which remembers the name of each flushed file. class FlushedFileCollector : public EventListener { public: - FlushedFileCollector() {} - ~FlushedFileCollector() override {} + FlushedFileCollector() = default; + ~FlushedFileCollector() override = default; void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { InstrumentedMutexLock lock(&mutex_); @@ -2973,7 +3085,7 @@ TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) { key_vals.push_back(Key1(j)); } for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { - keys.push_back(key_vals[j]); + keys.emplace_back(key_vals[j]); } ReadOptions ropts; @@ -3378,7 +3490,7 @@ TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) { VersionEdit* version_edit; SyncPoint::GetInstance()->SetCallBack( "DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit", - [&](void* arg) { version_edit = reinterpret_cast(arg); }); + [&](void* arg) { version_edit = static_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) { version_edit->SetFullHistoryTsLow(higher_ts_low); }); @@ -3403,7 +3515,7 @@ class HandleFileBoundariesTest : DBBasicTestWithTimestampBase("/handle_file_boundaries") {} }; -TEST_P(HandleFileBoundariesTest, ConfigurePersistUdt) { +TEST_P(HandleFileBoundariesTest, ConfigurePersistUdtWithPut) { Options options = CurrentOptions(); options.env = env_; // Write a timestamp that is not the min timestamp to help test the behavior @@ -3427,7 +3539,7 @@ TEST_P(HandleFileBoundariesTest, ConfigurePersistUdt) { ASSERT_OK( db_->Put(WriteOptions(), largest_ukey_without_ts, write_ts, "val2")); - // Create a L0 SST file and its record is added to the Manfiest. + // Create a L0 SST file and its record is added to the Manifest. ASSERT_OK(Flush()); Close(); @@ -3459,6 +3571,61 @@ TEST_P(HandleFileBoundariesTest, ConfigurePersistUdt) { Close(); } +TEST_P(HandleFileBoundariesTest, ConfigurePersistUdtWithRangeDelete) { + Options options = CurrentOptions(); + options.env = env_; + // Write a timestamp that is not the min/max timestamp to help test the + // behavior of flag `persist_user_defined_timestamps`. + std::string write_ts; + std::string min_ts; + std::string max_ts; + PutFixed64(&write_ts, 1); + PutFixed64(&min_ts, 0); + PutFixed64(&max_ts, std::numeric_limits::max()); + std::string smallest_ukey_without_ts = "bar"; + std::string largest_ukey_without_ts = "foo"; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + bool persist_udt = test::ShouldPersistUDT(GetParam()); + options.persist_user_defined_timestamps = persist_udt; + if (!persist_udt) { + options.allow_concurrent_memtable_write = false; + } + DestroyAndReopen(options); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + smallest_ukey_without_ts, largest_ukey_without_ts, + write_ts)); + + // Create a L0 SST file and its record is added to the Manifest. + ASSERT_OK(Flush()); + Close(); + + options.create_if_missing = false; + // Reopen the DB and process manifest file. + Reopen(options); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files.size(), 1); + // L0 only has one SST file. + ASSERT_EQ(level_to_files[0].size(), 1); + auto file_meta = level_to_files[0][0]; + if (persist_udt) { + ASSERT_EQ(smallest_ukey_without_ts + write_ts, + file_meta.smallest.user_key()); + } else { + ASSERT_EQ(smallest_ukey_without_ts + min_ts, file_meta.smallest.user_key()); + } + // When right file boundary comes from range deletion, it uses max timestamp + // and a range deletion sentinel that uses the max sequence number to mark the + // end key exclusive. This is regardless of whether timestamp is persisted. + ASSERT_EQ(largest_ukey_without_ts + max_ts, file_meta.largest.user_key()); + auto largest_footer = ExtractInternalKeyFooter(file_meta.largest.Encode()); + ASSERT_EQ(largest_footer, kRangeTombstoneSentinel); + Close(); +} + INSTANTIATE_TEST_CASE_P( ConfigurePersistUdt, HandleFileBoundariesTest, ::testing::Values( @@ -3679,12 +3846,12 @@ TEST_F(DBBasicTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { std::vector keys; std::vector values; for (size_t j = 0; j < 2; ++j) { - keys.push_back(key_vals[j]); + keys.emplace_back(key_vals[j]); } std::vector statuses = db_->MultiGet(read_opts, cfhs, keys, &values); - for (auto status : statuses) { + for (const auto& status : statuses) { ASSERT_TRUE(status.IsInvalidArgument()); } } @@ -3696,12 +3863,12 @@ TEST_F(DBBasicTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { std::vector keys; std::vector values; for (size_t j = 0; j < 1; ++j) { - keys.push_back(key_vals[j]); + keys.emplace_back(key_vals[j]); } std::vector statuses = db_->MultiGet(read_opts, one_cfh, keys, &values); - for (auto status : statuses) { + for (const auto& status : statuses) { ASSERT_TRUE(status.IsInvalidArgument()); } } @@ -3714,7 +3881,7 @@ TEST_F(DBBasicTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { Status statuses[] = {Status::OK(), Status::OK()}; db_->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0], &values[0], &statuses[0], /*sorted_input=*/false); - for (auto status : statuses) { + for (const auto& status : statuses) { ASSERT_TRUE(status.IsInvalidArgument()); } } @@ -3727,7 +3894,7 @@ TEST_F(DBBasicTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { Status statuses[] = {Status::OK()}; db_->MultiGet(read_opts, /*num_keys=*/1, &one_column_family[0], &keys[0], &values[0], &statuses[0], /*sorted_input=*/false); - for (auto status : statuses) { + for (const auto& status : statuses) { ASSERT_TRUE(status.IsInvalidArgument()); } } @@ -3894,42 +4061,80 @@ TEST_F(DBBasicTestWithTimestamp, Close(); } -TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) { +class DeleteRangeWithTimestampTableOptions + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple> { + public: + explicit DeleteRangeWithTimestampTableOptions() + : DBBasicTestWithTimestampBase( + "delete_range_with_timestamp_table_options") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DeleteRangeWithTimestampTableOptions, + testing::Combine( + testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey), + testing::Values( + test::UserDefinedTimestampTestMode::kNormal, + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp))); + +TEST_P(DeleteRangeWithTimestampTableOptions, BasicReadAndIterate) { const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; Options options = CurrentOptions(); + options.disable_auto_compactions = true; options.prefix_extractor.reset(NewFixedPrefixTransform(3)); options.compression = kNoCompression; BlockBasedTableOptions bbto; - bbto.index_type = GetParam(); + bbto.index_type = std::get<0>(GetParam()); bbto.block_size = 100; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.env = env_; options.create_if_missing = true; - const size_t kTimestampSize = Timestamp(0, 0).size(); - TestComparator test_cmp(kTimestampSize); - options.comparator = &test_cmp; + bool persist_udt = test::ShouldPersistUDT(std::get<1>(GetParam())); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = persist_udt; + // UDT in memtables only not compatible with concurrent memtable writes. + options.allow_concurrent_memtable_write = persist_udt; options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); DestroyAndReopen(options); // Write half of the keys before the tombstone and half after the tombstone. // Only covered keys (i.e., within the range and older than the tombstone) // should be deleted. + std::string full_history_ts_low; + int cutoff_ts = 0; for (int i = 0; i < kNum; ++i) { + std::string write_ts; + PutFixed64(&write_ts, i); if (i == kNum / 2) { ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key1(kRangeBegin), Key1(kRangeEnd), - Timestamp(i, 0))); + Key1(kRangeBegin), Key1(kRangeEnd), write_ts)); } - ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0), - "val" + std::to_string(i))); + ASSERT_OK( + db_->Put(WriteOptions(), Key1(i), write_ts, "val" + std::to_string(i))); if (i == kNum - kNumPerFile) { + if (!persist_udt) { + // When UDTs are not persisted, mark the timestamps in the Memtables as + // all expired so the followed flush can go through. + cutoff_ts = i + 1; + PutFixed64(&full_history_ts_low, cutoff_ts); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + } ASSERT_OK(Flush()); } } ReadOptions read_opts; read_opts.total_order_seek = true; - std::string read_ts = Timestamp(kNum, 0); + std::string read_ts; + PutFixed64(&read_ts, kNum); Slice read_ts_slice = read_ts; read_opts.timestamp = &read_ts_slice; { @@ -3964,33 +4169,43 @@ TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) { ASSERT_OK(iter->status()); ASSERT_EQ(-1, expected); - read_ts = Timestamp(0, 0); - read_ts_slice = read_ts; - read_opts.timestamp = &read_ts_slice; - iter.reset(db_->NewIterator(read_opts)); - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key(), Key1(0)); - iter->Next(); - ASSERT_FALSE(iter->Valid()); - ASSERT_OK(iter->status()); + // Cannot read below the cutoff timestamp when timestamps are not persisted. + if (persist_udt) { + read_ts.clear(); + PutFixed64(&read_ts, 0); + read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + iter.reset(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key1(0)); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } } - read_ts = Timestamp(kNum, 0); + read_ts.clear(); + PutFixed64(&read_ts, kNum); read_ts_slice = read_ts; read_opts.timestamp = &read_ts_slice; std::string value, timestamp; Status s; + std::string expected_ts; + int int_expected_ts; for (int i = 0; i < kNum; ++i) { s = db_->Get(read_opts, Key1(i), &value, ×tamp); if (i >= kRangeBegin && i < kNum / 2) { ASSERT_TRUE(s.IsNotFound()); - ASSERT_EQ(timestamp, Timestamp(kNum / 2, 0)); + int_expected_ts = (persist_udt || kNum / 2 >= cutoff_ts) ? kNum / 2 : 0; } else { ASSERT_OK(s); ASSERT_EQ(value, "val" + std::to_string(i)); - ASSERT_EQ(timestamp, Timestamp(i, 0)); + int_expected_ts = (persist_udt || i >= cutoff_ts) ? i : 0; } + expected_ts.clear(); + PutFixed64(&expected_ts, int_expected_ts); + ASSERT_EQ(timestamp, expected_ts); } size_t batch_size = kNum; @@ -4009,11 +4224,41 @@ TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) { for (int i = 0; i < kNum; ++i) { if (i >= kRangeBegin && i < kNum / 2) { ASSERT_TRUE(statuses[i].IsNotFound()); - ASSERT_EQ(timestamps[i], Timestamp(kNum / 2, 0)); + int_expected_ts = (persist_udt || kNum / 2 >= cutoff_ts) ? kNum / 2 : 0; } else { ASSERT_OK(statuses[i]); ASSERT_EQ(values[i], "val" + std::to_string(i)); - ASSERT_EQ(timestamps[i], Timestamp(i, 0)); + int_expected_ts = (persist_udt || i >= cutoff_ts) ? i : 0; + } + expected_ts.clear(); + PutFixed64(&expected_ts, int_expected_ts); + ASSERT_EQ(timestamps[i], expected_ts); + } + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + if (!persist_udt) { + // Mark everything expired so manual compaction can go through + full_history_ts_low.clear(); + PutFixed64(&full_history_ts_low, kNum); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + } + Slice compaction_ts = full_history_ts_low; + cro.full_history_ts_low = &compaction_ts; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + for (int i = kRangeBegin; i < kNum / 2; ++i) { + s = db_->Get(read_opts, Key1(i), &value, ×tamp); + ASSERT_TRUE(s.IsNotFound()); + if (persist_udt) { + expected_ts.clear(); + PutFixed64(&expected_ts, kNum / 2); + ASSERT_EQ(timestamp, expected_ts); + } else { + // When timestamps are not persisted, data in SST files all logically have + // min timestamp. A compaction to the last level will drop the range + // tombstone. + ASSERT_TRUE(timestamp.empty()); } } Close(); @@ -4442,8 +4687,8 @@ TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) { Slice read_ts_slice = Slice(read_ts_str); ReadOptions read_opts; read_opts.timestamp = &read_ts_slice; - std::string value_from_get = ""; - std::string timestamp_from_get = ""; + std::string value_from_get; + std::string timestamp_from_get; auto status = db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get); ASSERT_TRUE(status.IsNotFound()); @@ -4492,4 +4737,4 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); -} \ No newline at end of file +} diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc index 7d80c85c42b..783140cbf7d 100644 --- a/db/db_with_timestamp_compaction_test.cc +++ b/db/db_with_timestamp_compaction_test.cc @@ -64,7 +64,7 @@ TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - const auto* compaction = reinterpret_cast(arg); + const auto* compaction = static_cast(arg); ASSERT_NE(nullptr, compaction); ASSERT_EQ(0, compaction->start_level()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -172,8 +172,8 @@ TEST_F(TimestampCompatibleCompactionTest, MultipleSubCompactions) { class TestFilePartitioner : public SstPartitioner { public: - explicit TestFilePartitioner() {} - ~TestFilePartitioner() override {} + explicit TestFilePartitioner() = default; + ~TestFilePartitioner() override = default; const char* Name() const override { return "TestFilePartitioner"; } PartitionerResult ShouldPartition( @@ -188,7 +188,7 @@ class TestFilePartitioner : public SstPartitioner { class TestFilePartitionerFactory : public SstPartitionerFactory { public: - explicit TestFilePartitionerFactory() {} + explicit TestFilePartitionerFactory() = default; std::unique_ptr CreatePartitioner( const SstPartitioner::Context& /*context*/) const override { std::unique_ptr ret = diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc index eb33ec41e12..8021dbedb94 100644 --- a/db/db_write_buffer_manager_test.cc +++ b/db/db_write_buffer_manager_test.cc @@ -119,7 +119,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::WriteStall::Wait", [&](void* arg) { InstrumentedMutexLock lock(&mutex); - WriteThread::Writer* w = reinterpret_cast(arg); + WriteThread::Writer* w = static_cast(arg); w_set.insert(w); // Allow the flush to continue if all writer threads are blocked. if (w_set.size() == (unsigned long)num_writers) { @@ -368,7 +368,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::WriteStall::Wait", [&](void* arg) { - WriteThread::Writer* w = reinterpret_cast(arg); + WriteThread::Writer* w = static_cast(arg); { InstrumentedMutexLock lock(&mutex); w_set.insert(w); @@ -511,7 +511,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { "WriteThread::WriteStall::Wait", [&](void* arg) { { InstrumentedMutexLock lock(&mutex); - WriteThread::Writer* w = reinterpret_cast(arg); + WriteThread::Writer* w = static_cast(arg); w_slowdown_set.insert(w); // Allow the flush continue if all writer threads are blocked. if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load( @@ -674,7 +674,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::WriteStall::Wait", [&](void* arg) { - WriteThread::Writer* w = reinterpret_cast(arg); + WriteThread::Writer* w = static_cast(arg); InstrumentedMutexLock lock(&mutex); w_slowdown_set.insert(w); // Allow the flush continue if all writer threads are blocked. diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 59c26eaaaf5..f464a3036b8 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -269,6 +269,47 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } +TEST_P(DBWriteTest, WriteThreadWaitNanosCounter) { + Options options = GetOptions(); + std::vector threads; + + Reopen(options); + + std::function write_func = [&]() { + PerfContext* perf_ctx = get_perf_context(); + SetPerfLevel(PerfLevel::kEnableWait); + perf_ctx->Reset(); + TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc"); + ASSERT_OK(dbfull()->Put(WriteOptions(), "bar", "val2")); + ASSERT_GT(perf_ctx->write_thread_wait_nanos, 2000000U); + }; + + std::function sleep_func = [&]() { + TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1"); + SystemClock::Default()->SleepForMicroseconds(2000); + TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2"); + }; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WriteThread::EnterAsBatchGroupLeader:End", + "DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc"}, + {"WriteThread::AwaitState:BlockingWaiting", + "DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1"}, + {"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2", + "WriteThread::ExitAsBatchGroupLeader:Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + threads.emplace_back(sleep_func); + threads.emplace_back(write_func); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "foo", "val1")); + + for (auto& t : threads) { + t.join(); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { constexpr int kNumThreads = 5; std::unique_ptr mock_env( @@ -286,7 +327,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { ready_count++; - auto* w = reinterpret_cast(arg); + auto* w = static_cast(arg); if (w->state == WriteThread::STATE_GROUP_LEADER) { leader_count++; while (ready_count < kNumThreads) { @@ -296,7 +337,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { }); SyncPoint::GetInstance()->EnableProcessing(); for (int i = 0; i < kNumThreads; i++) { - threads.push_back(port::Thread( + threads.emplace_back( [&](int index) { // All threads should fail. auto res = Put("key" + std::to_string(index), "value"); @@ -313,7 +354,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { ASSERT_FALSE(res.ok()); } }, - i)); + i); } for (int i = 0; i < kNumThreads; i++) { threads[i].join(); @@ -384,7 +425,7 @@ TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) { second_write_in_progress = true; return; } - auto* w = reinterpret_cast(arg); + auto* w = static_cast(arg); if (w->state == WriteThread::STATE_GROUP_LEADER) { active_writers++; if (leader.load() == nullptr) { @@ -404,7 +445,7 @@ TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) { - auto* wg = reinterpret_cast(arg); + auto* wg = static_cast(arg); if (wg->leader == leader && !finished_WAL_write) { finished_WAL_write = true; while (active_writers.load() < 3) { @@ -416,7 +457,7 @@ TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters", [&](void* arg) { - auto* wg = reinterpret_cast(arg); + auto* wg = static_cast(arg); if (wg->leader == leader) { while (!second_write_in_progress.load()) { // wait for the old follower thread to start the next write @@ -780,6 +821,95 @@ TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { ASSERT_LE(bytes_num, 1024 * 100); } +void CorruptLogFile(Env* env, Options& options, std::string log_path, + uint64_t log_num, int record_num) { + std::shared_ptr fs = env->GetFileSystem(); + std::unique_ptr file_reader; + Status status; + { + std::unique_ptr file; + status = fs->NewSequentialFile(log_path, FileOptions(), &file, nullptr); + ASSERT_EQ(status, IOStatus::OK()); + file_reader.reset(new SequentialFileReader(std::move(file), log_path)); + } + std::unique_ptr reader(new log::Reader( + nullptr, std::move(file_reader), nullptr, false, log_num)); + std::string scratch; + Slice record; + uint64_t record_checksum; + for (int i = 0; i < record_num; ++i) { + ASSERT_TRUE(reader->ReadRecord(&record, &scratch, options.wal_recovery_mode, + &record_checksum)); + } + uint64_t rec_start = reader->LastRecordOffset(); + reader.reset(); + { + std::unique_ptr file; + status = fs->NewRandomRWFile(log_path, FileOptions(), &file, nullptr); + ASSERT_EQ(status, IOStatus::OK()); + uint32_t bad_lognum = 0xff; + ASSERT_EQ(file->Write( + rec_start + 7, + Slice(reinterpret_cast(&bad_lognum), sizeof(uint32_t)), + IOOptions(), nullptr), + IOStatus::OK()); + ASSERT_OK(file->Close(IOOptions(), nullptr)); + file.reset(); + } +} + +TEST_P(DBWriteTest, RecycleLogTest) { + Options options = GetOptions(); + options.recycle_log_file_num = 0; + options.avoid_flush_during_recovery = true; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + Reopen(options); + ASSERT_OK(Put(Key(1), "val1")); + ASSERT_OK(Put(Key(2), "val1")); + + uint64_t latest_log_num = 0; + std::unique_ptr log_file; + ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file)); + latest_log_num = log_file->LogNumber(); + Reopen(options); + ASSERT_OK(Put(Key(3), "val3")); + + // Corrupt second entry of first log + std::string log_path = LogFileName(dbname_, latest_log_num); + CorruptLogFile(env_, options, log_path, latest_log_num, 2); + + Reopen(options); + ASSERT_EQ(Get(Key(1)), "val1"); + ASSERT_EQ(Get(Key(2)), "NOT_FOUND"); + ASSERT_EQ(Get(Key(3)), "NOT_FOUND"); +} + +TEST_P(DBWriteTest, RecycleLogTestCFAheadOfWAL) { + Options options = GetOptions(); + options.recycle_log_file_num = 0; + options.avoid_flush_during_recovery = true; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, Key(1), "val1")); + ASSERT_OK(Put(0, Key(2), "val2")); + + uint64_t latest_log_num = 0; + std::unique_ptr log_file; + ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file)); + latest_log_num = log_file->LogNumber(); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, Key(3), "val3")); + + // Corrupt second entry of first log + std::string log_path = LogFileName(dbname_, latest_log_num); + CorruptLogFile(env_, options, log_path, latest_log_num, 2); + + ASSERT_EQ(TryReopenWithColumnFamilies({"default", "pikachu"}, options), + Status::Corruption()); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/dbformat.cc b/db/dbformat.cc index 63bb354de87..2378ba488b9 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -8,9 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#include - #include +#include #include "db/lookup_key.h" #include "monitoring/perf_context_imp.h" @@ -26,9 +25,9 @@ namespace ROCKSDB_NAMESPACE { // and the value type is embedded as the low 8 bits in the sequence // number in internal keys, we need to use the highest-numbered // ValueType, not the lowest). -const ValueType kValueTypeForSeek = kTypeWideColumnEntity; +const ValueType kValueTypeForSeek = kTypeValuePreferredSeqno; const ValueType kValueTypeForSeekForPrev = kTypeDeletion; -const std::string kDisableUserTimestamp(""); +const std::string kDisableUserTimestamp; EntryType GetEntryType(ValueType value_type) { switch (value_type) { @@ -67,6 +66,13 @@ void AppendInternalKeyWithDifferentTimestamp(std::string* result, PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); } +void AppendUserKeyWithDifferentTimestamp(std::string* result, const Slice& key, + const Slice& ts) { + assert(key.size() >= ts.size()); + result->append(key.data(), key.size() - ts.size()); + result->append(ts.data(), ts.size()); +} + void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t) { PutFixed64(result, PackSequenceAndType(s, t)); @@ -111,6 +117,7 @@ void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key, size_t ts_sz) { assert(ts_sz > 0); + assert(key.size() >= kNumInternalBytes); size_t user_key_size = key.size() - kNumInternalBytes; result->reserve(key.size() + ts_sz); result->append(key.data(), user_key_size); @@ -118,6 +125,17 @@ void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key, result->append(key.data() + user_key_size, kNumInternalBytes); } +void PadInternalKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + assert(key.size() >= kNumInternalBytes); + size_t user_key_size = key.size() - kNumInternalBytes; + result->reserve(key.size() + ts_sz); + result->append(key.data(), user_key_size); + result->append(std::string(ts_sz, '\xff')); + result->append(key.data() + user_key_size, kNumInternalBytes); +} + void StripTimestampFromInternalKey(std::string* result, const Slice& key, size_t ts_sz) { assert(key.size() >= ts_sz + kNumInternalBytes); diff --git a/db/dbformat.h b/db/dbformat.h index 981866c09dc..5b16726693e 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -67,7 +68,9 @@ enum ValueType : unsigned char { kTypeDeletionWithTimestamp = 0x14, kTypeCommitXIDAndTimestamp = 0x15, // WAL only kTypeWideColumnEntity = 0x16, - kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only + kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only + kTypeValuePreferredSeqno = 0x18, // Value with a unix write time + kTypeColumnFamilyValuePreferredSeqno = 0x19, // WAL only kTypeMaxValid, // Should be after the last valid type, only used for // validation kMaxValue = 0x7F // Not used for storing records. @@ -77,11 +80,38 @@ enum ValueType : unsigned char { extern const ValueType kValueTypeForSeek; extern const ValueType kValueTypeForSeekForPrev; +// A range of user keys used internally by RocksDB. Also see `Range` used by +// public APIs. +struct UserKeyRange { + // In case of user_defined timestamp, if enabled, `start` and `limit` should + // include user_defined timestamps. + Slice start; + Slice limit; + + UserKeyRange() = default; + UserKeyRange(const Slice& s, const Slice& l) : start(s), limit(l) {} +}; + +// A range of user keys used internally by RocksDB. Also see `RangePtr` used by +// public APIs. +struct UserKeyRangePtr { + // In case of user_defined timestamp, if enabled, `start` and `limit` should + // point to key with timestamp part. + // An optional range start, if missing, indicating a start before all keys. + std::optional start; + // An optional range end, if missing, indicating an end after all keys. + std::optional limit; + + UserKeyRangePtr(const std::optional& s, const std::optional& l) + : start(s), limit(l) {} +}; + // Checks whether a type is an inline value type // (i.e. a type used in memtable skiplist and sst file datablock). inline bool IsValueType(ValueType t) { return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t || - kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t; + kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t || + kTypeValuePreferredSeqno == t; } // Checks whether a type is from user operation @@ -165,6 +195,9 @@ inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, // assert(IsExtendedValueType(*t)); } +const uint64_t kRangeTombstoneSentinel = + PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); + EntryType GetEntryType(ValueType value_type); // Append the serialization of "key" to *result. @@ -184,6 +217,15 @@ void AppendInternalKeyWithDifferentTimestamp(std::string* result, const ParsedInternalKey& key, const Slice& ts); +// Append the user key to *result, replacing the original timestamp with +// argument ts. +// +// input [user key]: +// output before: empty +// output after: +void AppendUserKeyWithDifferentTimestamp(std::string* result, const Slice& key, + const Slice& ts); + // Serialized internal key consists of user key followed by footer. // This function appends the footer to *result, assuming that *result already // contains the user key at the end. @@ -237,6 +279,16 @@ void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key, size_t ts_sz); +// `key` is an internal key containing a user key without timestamp. Create a +// new key in *result by padding a max timestamp of size `ts_sz` to the user key +// and copying the remaining internal key bytes. +// +// input [internal key]: +// output before: empty +// output after: +void PadInternalKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + // `key` is an internal key containing a user key with timestamp of size // `ts_sz`. Create a new internal key in *result by stripping the timestamp from // the user key and copying the remaining internal key bytes. @@ -423,7 +475,7 @@ class InternalKey { void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t, const Slice& ts) { - ParsedInternalKey pik = ParsedInternalKey(_user_key_with_ts, s, t); + ParsedInternalKey pik(_user_key_with_ts, s, t); // Should not call pik.SetTimestamp() directly as it overwrites the buffer // containing _user_key. SetFrom(pik, ts); @@ -821,19 +873,19 @@ class InternalKeySliceTransform : public SliceTransform { explicit InternalKeySliceTransform(const SliceTransform* transform) : transform_(transform) {} - virtual const char* Name() const override { return transform_->Name(); } + const char* Name() const override { return transform_->Name(); } - virtual Slice Transform(const Slice& src) const override { + Slice Transform(const Slice& src) const override { auto user_key = ExtractUserKey(src); return transform_->Transform(user_key); } - virtual bool InDomain(const Slice& src) const override { + bool InDomain(const Slice& src) const override { auto user_key = ExtractUserKey(src); return transform_->InDomain(user_key); } - virtual bool InRange(const Slice& dst) const override { + bool InRange(const Slice& dst) const override { auto user_key = ExtractUserKey(dst); return transform_->InRange(user_key); } @@ -860,7 +912,8 @@ bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record); // resulting from this call will include timestamp. Status ReadRecordFromWriteBatch(Slice* input, char* tag, uint32_t* column_family, Slice* key, - Slice* value, Slice* blob, Slice* xid); + Slice* value, Slice* blob, Slice* xid, + uint64_t* write_unix_time); // When user call DeleteRange() to delete a range of keys, // we will store a serialized RangeTombstone in MemTable and SST. @@ -883,17 +936,25 @@ struct RangeTombstone { // User-defined timestamp is enabled, `sk` and `ek` should be user key // with timestamp, `ts` will replace the timestamps in `sk` and // `ek`. - RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts) - : seq_(sn), ts_(ts) { - assert(!ts.empty()); + // When `logical_strip_timestamp` is true, the timestamps in `sk` and `ek` + // will be replaced with min timestamp. + RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts, + bool logical_strip_timestamp) + : seq_(sn) { + const size_t ts_sz = ts.size(); + assert(ts_sz > 0); pinned_start_key_.reserve(sk.size()); - pinned_start_key_.append(sk.data(), sk.size() - ts.size()); - pinned_start_key_.append(ts.data(), ts.size()); pinned_end_key_.reserve(ek.size()); - pinned_end_key_.append(ek.data(), ek.size() - ts.size()); - pinned_end_key_.append(ts.data(), ts.size()); + if (logical_strip_timestamp) { + AppendUserKeyWithMinTimestamp(&pinned_start_key_, sk, ts_sz); + AppendUserKeyWithMinTimestamp(&pinned_end_key_, ek, ts_sz); + } else { + AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts); + AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts); + } start_key_ = pinned_start_key_; end_key_ = pinned_end_key_; + ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz); } RangeTombstone(ParsedInternalKey parsed_key, Slice value) { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index b6d4f559e2e..7a86e913936 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -7,9 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include - +#include #include #include #include @@ -106,7 +104,7 @@ class DeleteFileTest : public DBTestBase { ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; - for (auto file : filenames) { + for (const auto& file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { @@ -127,7 +125,7 @@ class DeleteFileTest : public DBTestBase { } static void DoSleep(void* arg) { - auto test = reinterpret_cast(arg); + auto test = static_cast(arg); test->env_->SleepForMicroseconds(2 * 1000 * 1000); } @@ -148,9 +146,9 @@ TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { std::vector metadata; db_->GetLiveFilesMetaData(&metadata); - std::string level1file = ""; + std::string level1file; int level1keycount = 0; - std::string level2file = ""; + std::string level2file; int level2keycount = 0; int level1index = 0; int level2index = 1; diff --git a/db/error_handler.cc b/db/error_handler.cc index f4326100182..35148e72e5e 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -234,7 +234,7 @@ void ErrorHandler::CancelErrorRecovery() { // recovery gets scheduled at that point auto_recovery_ = false; SstFileManagerImpl* sfm = - reinterpret_cast(db_options_.sst_file_manager.get()); + static_cast(db_options_.sst_file_manager.get()); if (sfm) { // This may or may not cancel a pending recovery db_mutex_->Unlock(); @@ -277,10 +277,6 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, return kOkStatus; } - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED); - } ROCKS_LOG_INFO(db_options_.info_log, "ErrorHandler: Set regular background error\n"); @@ -365,25 +361,28 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, // This is the main function for looking at IO related error during the // background operations. The main logic is: -// 1) File scope IO error is treated as retryable IO error in the write -// path. In RocksDB, If a file has write IO error and it is at file scope, -// RocksDB never write to the same file again. RocksDB will create a new -// file and rewrite the whole content. Thus, it is retryable. +// File scope IO error is treated as retryable IO error in the write path. In +// RocksDB, If a file has write IO error and it is at file scope, RocksDB never +// write to the same file again. RocksDB will create a new file and rewrite the +// whole content. Thus, it is retryable. +// There are three main categories of error handling: // 1) if the error is caused by data loss, the error is mapped to // unrecoverable error. Application/user must take action to handle // this situation (File scope case is excluded). // 2) if the error is a Retryable IO error (i.e., it is a file scope IO error, -// or its retryable flag is set and not a data loss error), auto resume -// will be called and the auto resume can be controlled by resume count -// and resume interval options. There are three sub-cases: +// or its retryable flag is set and not a data loss error), auto resume ( +// DBImpl::ResumeImpl) may be called and the auto resume can be controlled +// by resume count and resume interval options. There are three sub-cases: // a) if the error happens during compaction, it is mapped to a soft error. -// the compaction thread will reschedule a new compaction. +// the compaction thread will reschedule a new compaction. This doesn't +// call auto resume. // b) if the error happens during flush and also WAL is empty, it is mapped // to a soft error. Note that, it includes the case that IO error happens -// in SST or manifest write during flush. -// c) all other errors are mapped to hard error. -// 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason -// reason) will be called to handle other error cases. +// in SST or manifest write during flush. Auto resume will be called. +// c) all other errors are mapped to hard error. Auto resume will be called. +// 3) for other cases, HandleKnownErrors(const Status& bg_err, +// BackgroundErrorReason reason) will be called to handle other error cases +// such as delegating to SstFileManager to handle no space error. const Status& ErrorHandler::SetBGError(const Status& bg_status, BackgroundErrorReason reason) { db_mutex_->AssertHeld(); @@ -396,6 +395,9 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", bg_io_err.ToString().c_str()); + RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT}, + {} /* int_histograms */); + Status new_bg_io_err = bg_io_err; DBRecoverContext context; if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && @@ -405,14 +407,6 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, bool auto_recovery = false; Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); CheckAndSetRecoveryAndBGError(bg_err); - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED); - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED); - } ROCKS_LOG_INFO( db_options_.info_log, "ErrorHandler: Set background IO error as unrecoverable error\n"); @@ -436,18 +430,9 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &new_bg_io_err, db_mutex_, &auto_recovery); - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED); - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED); - } + + RecordStats({ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT}, + {} /* int_histograms */); ROCKS_LOG_INFO(db_options_.info_log, "ErrorHandler: Set background retryable IO error\n"); if (BackgroundErrorReason::kCompaction == reason) { @@ -456,17 +441,18 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, // this case // TODO: a better way to set or clean the retryable IO error which // happens during compaction SST file write. - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); - } + RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */); ROCKS_LOG_INFO( db_options_.info_log, "ErrorHandler: Compaction will schedule by itself to resume\n"); // Not used in this code path. new_bg_io_err.PermitUncheckedError(); return bg_error_; - } else if (BackgroundErrorReason::kFlushNoWAL == reason || - BackgroundErrorReason::kManifestWriteNoWAL == reason) { + } + + Status::Severity severity; + if (BackgroundErrorReason::kFlushNoWAL == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { // When the BG Retryable IO error reason is flush without WAL, // We map it to a soft error. At the same time, all the background work // should be stopped except the BG work from recovery. Therefore, we @@ -474,24 +460,17 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, // continues to receive writes when BG error is soft error, to avoid // to many small memtable being generated during auto resume, the flush // reason is set to kErrorRecoveryRetryFlush. - Status bg_err(new_bg_io_err, Status::Severity::kSoftError); - CheckAndSetRecoveryAndBGError(bg_err); + severity = Status::Severity::kSoftError; soft_error_no_bg_work_ = true; context.flush_reason = FlushReason::kErrorRecoveryRetryFlush; - recover_context_ = context; - return StartRecoverFromRetryableBGIOError(bg_io_err); } else { - Status bg_err(new_bg_io_err, Status::Severity::kHardError); - CheckAndSetRecoveryAndBGError(bg_err); - recover_context_ = context; - return StartRecoverFromRetryableBGIOError(bg_io_err); + severity = Status::Severity::kHardError; } + Status bg_err(new_bg_io_err, severity); + CheckAndSetRecoveryAndBGError(bg_err); + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); } else { - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED); - } return HandleKnownErrors(new_bg_io_err, reason); } } @@ -555,7 +534,7 @@ Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, void ErrorHandler::RecoverFromNoSpace() { SstFileManagerImpl* sfm = - reinterpret_cast(db_options_.sst_file_manager.get()); + static_cast(db_options_.sst_file_manager.get()); // Inform SFM of the error, so it can kick-off the recovery if (sfm) { @@ -593,7 +572,7 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { // If its a manual recovery and there's a background recovery in progress // return busy status if (recovery_in_prog_) { - return Status::Busy(); + return Status::Busy("Recovery already in progress"); } recovery_in_prog_ = true; @@ -659,9 +638,7 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( db_mutex_->AssertHeld(); return bg_error_; } - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); - } + RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */); ROCKS_LOG_INFO( db_options_.info_log, "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); @@ -729,19 +706,15 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { recovery_error_ = IOStatus::OK(); retry_count++; Status s = db_->ResumeImpl(context); - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT); - } + RecordStats({ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT}, + {} /* int_histograms */); if (s.IsShutdownInProgress() || bg_error_.severity() >= Status::Severity::kFatalError) { // If DB shutdown in progress or the error severity is higher than // Hard Error, stop auto resume and returns. recovery_in_prog_ = false; - if (bg_error_stats_ != nullptr) { - RecordInHistogram(bg_error_stats_.get(), - ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); - } + RecordStats({} /* ticker_types */, + {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, bg_error_, db_mutex_); return; @@ -764,21 +737,15 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { // recover from the retryable IO error and no other BG errors. Clean // the bg_error and notify user. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); - if (bg_error_stats_ != nullptr) { - RecordTick(bg_error_stats_.get(), - ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); - RecordInHistogram(bg_error_stats_.get(), - ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); - } + RecordStats({ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT}, + {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); return; } else { // In this case: 1) recovery_error_ is more serious or not retryable // 2) other error happens. The auto recovery stops. recovery_in_prog_ = false; - if (bg_error_stats_ != nullptr) { - RecordInHistogram(bg_error_stats_.get(), - ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); - } + RecordStats({} /* ticker_types */, + {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); EventHelpers::NotifyOnErrorRecoveryEnd( db_options_.listeners, bg_error_, !recovery_error_.ok() ? recovery_error_ : s, db_mutex_); @@ -792,11 +759,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { db_options_.listeners, bg_error_, Status::Aborted("Exceeded resume retry count"), db_mutex_); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); - if (bg_error_stats_ != nullptr) { - RecordInHistogram(bg_error_stats_.get(), - ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); - } - return; + RecordStats({} /* ticker_types */, + {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}}); } void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { @@ -809,7 +773,6 @@ void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { if (bg_error_.severity() >= Status::Severity::kHardError) { is_db_stopped_.store(true, std::memory_order_release); } - return; } void ErrorHandler::EndAutoRecovery() { @@ -827,7 +790,22 @@ void ErrorHandler::EndAutoRecovery() { db_mutex_->Lock(); } TEST_SYNC_POINT("PostEndAutoRecovery"); - return; +} + +void ErrorHandler::RecordStats( + const std::vector& ticker_types, + const std::vector>& int_histograms) { + if (bg_error_stats_ == nullptr) { + return; + } + for (const auto& ticker_type : ticker_types) { + RecordTick(bg_error_stats_.get(), ticker_type); + } + + for (const auto& hist : int_histograms) { + RecordInHistogram(bg_error_stats_.get(), std::get<0>(hist), + std::get<1>(hist)); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/db/error_handler.h b/db/error_handler.h index 1168d91fa87..0188471ac17 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -94,6 +94,10 @@ class ErrorHandler { void ClearFilesToQuarantine(); private: + void RecordStats( + const std::vector& ticker_types, + const std::vector>& int_histograms); + DBImpl* db_; const ImmutableDBOptions& db_options_; Status bg_error_; @@ -107,7 +111,10 @@ class ErrorHandler { std::unique_ptr recovery_thread_; InstrumentedMutex* db_mutex_; - // A flag indicating whether automatic recovery from errors is enabled + // A flag indicating whether automatic recovery from errors is enabled. Auto + // recovery applies for delegating to SstFileManager to handle no space type + // of errors. This flag doesn't control the auto resume behavior to recover + // from retryable IO errors. bool auto_recovery_; bool recovery_in_prog_; // A flag to indicate that for the soft error, we should not allow any diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 65f6a5a4861..da1ed8ea3a2 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -145,7 +145,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( jwriter << "N/A"; } else { SeqnoToTimeMapping tmp; - Status status = tmp.Add(table_properties.seqno_to_time_mapping); + Status status = tmp.DecodeFrom(table_properties.seqno_to_time_mapping); if (status.ok()) { jwriter << tmp.ToHumanString(); } else { diff --git a/db/experimental.cc b/db/experimental.cc index f6f920b2ccb..402dd954089 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -5,13 +5,19 @@ #include "rocksdb/experimental.h" +#include +#include +#include +#include +#include +#include + #include "db/db_impl/db_impl.h" #include "db/version_util.h" #include "logging/logging.h" +#include "util/atomic.h" -namespace ROCKSDB_NAMESPACE { -namespace experimental { - +namespace ROCKSDB_NAMESPACE::experimental { Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { @@ -38,8 +44,9 @@ Status UpdateManifestForFilesState( const DBOptions& db_opts, const std::string& db_name, const std::vector& column_families, const UpdateManifestForFilesStateOptions& opts) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; OfflineManifestWriter w(db_opts, db_name); Status s = w.Recover(column_families); @@ -117,7 +124,8 @@ Status UpdateManifestForFilesState( std::unique_ptr db_dir; s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, write_options, cfd, &edit, + db_dir.get()); } if (s.ok()) { ++cfs_updated; @@ -141,5 +149,1063 @@ Status UpdateManifestForFilesState( return s; } -} // namespace experimental -} // namespace ROCKSDB_NAMESPACE +// EXPERIMENTAL new filtering features + +namespace { +void GetFilterInput(FilterInput select, const Slice& key, + const KeySegmentsExtractor::Result& extracted, + Slice* out_input, Slice* out_leadup) { + struct FilterInputGetter { + explicit FilterInputGetter(const Slice& _key, + const KeySegmentsExtractor::Result& _extracted) + : key(_key), extracted(_extracted) {} + const Slice& key; + const KeySegmentsExtractor::Result& extracted; + + Slice operator()(SelectKeySegment select) { + size_t count = extracted.segment_ends.size(); + if (count <= select.segment_index) { + return Slice(); + } + assert(count > 0); + size_t start = select.segment_index > 0 + ? extracted.segment_ends[select.segment_index - 1] + : 0; + size_t end = + extracted + .segment_ends[std::min(size_t{select.segment_index}, count - 1)]; + return Slice(key.data() + start, end - start); + } + + Slice operator()(SelectKeySegmentRange select) { + assert(select.from_segment_index <= select.to_segment_index); + size_t count = extracted.segment_ends.size(); + if (count <= select.from_segment_index) { + return Slice(); + } + assert(count > 0); + size_t start = select.from_segment_index > 0 + ? extracted.segment_ends[select.from_segment_index - 1] + : 0; + size_t end = extracted.segment_ends[std::min( + size_t{select.to_segment_index}, count - 1)]; + return Slice(key.data() + start, end - start); + } + + Slice operator()(SelectWholeKey) { return key; } + + Slice operator()(SelectLegacyKeyPrefix) { + // TODO + assert(false); + return Slice(); + } + + Slice operator()(SelectUserTimestamp) { + // TODO + assert(false); + return Slice(); + } + + Slice operator()(SelectColumnName) { + // TODO + assert(false); + return Slice(); + } + + Slice operator()(SelectValue) { + // TODO + assert(false); + return Slice(); + } + }; + + Slice input = std::visit(FilterInputGetter(key, extracted), select); + *out_input = input; + if (input.empty() || input.data() < key.data() || + input.data() > key.data() + key.size()) { + *out_leadup = key; + } else { + *out_leadup = Slice(key.data(), input.data() - key.data()); + } +} + +const char* DeserializeFilterInput(const char* p, const char* limit, + FilterInput* out) { + if (p >= limit) { + return nullptr; + } + uint8_t b = static_cast(*p++); + if (b & 0x80) { + // Reserved for future use to read more bytes + return nullptr; + } + + switch (b >> 4) { + case 0: + // Various cases that don't have an argument + switch (b) { + case 0: + *out = SelectWholeKey{}; + return p; + case 1: + *out = SelectLegacyKeyPrefix{}; + return p; + case 2: + *out = SelectUserTimestamp{}; + return p; + case 3: + *out = SelectColumnName{}; + return p; + case 4: + *out = SelectValue{}; + return p; + default: + // Reserved for future use + return nullptr; + } + case 1: + // First 16 cases of SelectKeySegment + *out = SelectKeySegment{BitwiseAnd(b, 0xf)}; + return p; + case 2: + // First 16 cases of SelectKeySegmentRange + // that are not a single key segment + // 0: 0-1 + // 1: 0-2 + // 2: 1-2 + // 3: 0-3 + // 4: 1-3 + // 5: 2-3 + // 6: 0-4 + // 7: 1-4 + // 8: 2-4 + // 9: 3-4 + // 10: 0-5 + // 11: 1-5 + // 12: 2-5 + // 13: 3-5 + // 14: 4-5 + // 15: 0-6 + if (b < 6) { + if (b >= 3) { + *out = SelectKeySegmentRange{static_cast(b - 3), 3}; + } else if (b >= 1) { + *out = SelectKeySegmentRange{static_cast(b - 1), 2}; + } else { + *out = SelectKeySegmentRange{0, 1}; + } + } else if (b < 10) { + *out = SelectKeySegmentRange{static_cast(b - 6), 4}; + } else if (b < 15) { + *out = SelectKeySegmentRange{static_cast(b - 10), 5}; + } else { + *out = SelectKeySegmentRange{0, 6}; + } + return p; + default: + // Reserved for future use + return nullptr; + } +} + +void SerializeFilterInput(std::string* out, const FilterInput& select) { + struct FilterInputSerializer { + std::string* out; + void operator()(SelectWholeKey) { out->push_back(0); } + void operator()(SelectLegacyKeyPrefix) { out->push_back(1); } + void operator()(SelectUserTimestamp) { out->push_back(2); } + void operator()(SelectColumnName) { out->push_back(3); } + void operator()(SelectValue) { out->push_back(4); } + void operator()(SelectKeySegment select) { + // TODO: expand supported cases + assert(select.segment_index < 16); + out->push_back(static_cast((1 << 4) | select.segment_index)); + } + void operator()(SelectKeySegmentRange select) { + auto from = select.from_segment_index; + auto to = select.to_segment_index; + // TODO: expand supported cases + assert(from < 6); + assert(to < 6 || (to == 6 && from == 0)); + assert(from < to); + int start = (to - 1) * to / 2; + assert(start + from < 16); + out->push_back(static_cast((2 << 4) | (start + from))); + } + }; + std::visit(FilterInputSerializer{out}, select); +} + +size_t GetFilterInputSerializedLength(const FilterInput& /*select*/) { + // TODO: expand supported cases + return 1; +} + +uint64_t CategorySetToUint(const KeySegmentsExtractor::KeyCategorySet& s) { + static_assert(sizeof(KeySegmentsExtractor::KeyCategorySet) == + sizeof(uint64_t)); + return *reinterpret_cast(&s); +} + +KeySegmentsExtractor::KeyCategorySet UintToCategorySet(uint64_t s) { + static_assert(sizeof(KeySegmentsExtractor::KeyCategorySet) == + sizeof(uint64_t)); + return *reinterpret_cast(&s); +} + +enum BuiltinSstQueryFilters : char { + // Wraps a set of filters such that they use a particular + // KeySegmentsExtractor and a set of categories covering all keys seen. + // TODO: unit test category covering filtering + kExtrAndCatFilterWrapper = 0x1, + + // Wraps a set of filters to limit their scope to a particular set of + // categories. (Unlike kExtrAndCatFilterWrapper, + // keys in other categories may have been seen so are not filtered here.) + // TODO: unit test more subtleties + kCategoryScopeFilterWrapper = 0x2, + + // ... (reserve some values for more wrappers) + + // A filter representing the bytewise min and max values of a numbered + // segment or composite (range of segments). The empty value is tracked + // and filtered independently because it might be a special case that is + // not representative of the minimum in a spread of values. + kBytewiseMinMaxFilter = 0x10, +}; + +class SstQueryFilterBuilder { + public: + virtual ~SstQueryFilterBuilder() = default; + virtual void Add(const Slice& key, + const KeySegmentsExtractor::Result& extracted, + const Slice* prev_key, + const KeySegmentsExtractor::Result* prev_extracted) = 0; + virtual Status GetStatus() const = 0; + virtual size_t GetEncodedLength() const = 0; + virtual void Finish(std::string& append_to) = 0; +}; + +class SstQueryFilterConfigImpl : public SstQueryFilterConfig { + public: + explicit SstQueryFilterConfigImpl( + const FilterInput& input, + const KeySegmentsExtractor::KeyCategorySet& categories) + : input_(input), categories_(categories) {} + + virtual ~SstQueryFilterConfigImpl() = default; + + virtual std::unique_ptr NewBuilder( + bool sanity_checks) const = 0; + + protected: + FilterInput input_; + KeySegmentsExtractor::KeyCategorySet categories_; +}; + +class CategoryScopeFilterWrapperBuilder : public SstQueryFilterBuilder { + public: + explicit CategoryScopeFilterWrapperBuilder( + KeySegmentsExtractor::KeyCategorySet categories, + std::unique_ptr wrapped) + : categories_(categories), wrapped_(std::move(wrapped)) {} + + void Add(const Slice& key, const KeySegmentsExtractor::Result& extracted, + const Slice* prev_key, + const KeySegmentsExtractor::Result* prev_extracted) override { + if (!categories_.Contains(extracted.category)) { + // Category not in scope of the contituent filters + return; + } + wrapped_->Add(key, extracted, prev_key, prev_extracted); + } + + Status GetStatus() const override { return wrapped_->GetStatus(); } + + size_t GetEncodedLength() const override { + size_t wrapped_length = wrapped_->GetEncodedLength(); + if (wrapped_length == 0) { + // Use empty filter + // FIXME: needs unit test + return 0; + } else { + // For now in the code, wraps only 1 filter, but schema supports multiple + return 1 + VarintLength(CategorySetToUint(categories_)) + + VarintLength(1) + wrapped_length; + } + } + + void Finish(std::string& append_to) override { + size_t encoded_length = GetEncodedLength(); + if (encoded_length == 0) { + // Nothing to do + return; + } + size_t old_append_to_size = append_to.size(); + append_to.reserve(old_append_to_size + encoded_length); + append_to.push_back(kCategoryScopeFilterWrapper); + + PutVarint64(&append_to, CategorySetToUint(categories_)); + + // Wrapping just 1 filter for now + PutVarint64(&append_to, 1); + wrapped_->Finish(append_to); + } + + private: + KeySegmentsExtractor::KeyCategorySet categories_; + std::unique_ptr wrapped_; +}; + +class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { + public: + using SstQueryFilterConfigImpl::SstQueryFilterConfigImpl; + + std::unique_ptr NewBuilder( + bool sanity_checks) const override { + auto b = std::make_unique(*this, sanity_checks); + if (categories_ != KeySegmentsExtractor::KeyCategorySet::All()) { + return std::make_unique(categories_, + std::move(b)); + } else { + return b; + } + } + + static bool RangeMayMatch( + const Slice& filter, const Slice& lower_bound_incl, + const KeySegmentsExtractor::Result& lower_bound_extracted, + const Slice& upper_bound_excl, + const KeySegmentsExtractor::Result& upper_bound_extracted) { + assert(!filter.empty() && filter[0] == kBytewiseMinMaxFilter); + if (filter.size() <= 4) { + // Missing some data + return true; + } + bool empty_included = (filter[1] & kEmptySeenFlag) != 0; + const char* p = filter.data() + 2; + const char* limit = filter.data() + filter.size(); + + FilterInput in; + p = DeserializeFilterInput(p, limit, &in); + if (p == nullptr) { + // Corrupt or unsupported + return true; + } + + uint32_t smallest_size; + p = GetVarint32Ptr(p, limit, &smallest_size); + if (p == nullptr || static_cast(limit - p) <= smallest_size) { + // Corrupt + return true; + } + Slice smallest = Slice(p, smallest_size); + p += smallest_size; + + size_t largest_size = static_cast(limit - p); + Slice largest = Slice(p, largest_size); + + Slice lower_bound_input, lower_bound_leadup; + Slice upper_bound_input, upper_bound_leadup; + GetFilterInput(in, lower_bound_incl, lower_bound_extracted, + &lower_bound_input, &lower_bound_leadup); + GetFilterInput(in, upper_bound_excl, upper_bound_extracted, + &upper_bound_input, &upper_bound_leadup); + + if (lower_bound_leadup.compare(upper_bound_leadup) != 0) { + // Unable to filter range when bounds have different lead-up to key + // segment + return true; + } + + if (empty_included && lower_bound_input.empty()) { + // May match on 0-length segment + return true; + } + // TODO: potentially fix upper bound to actually be exclusive, but it's not + // as simple as changing >= to > below, because it's upper_bound_excl that's + // exclusive, and the upper_bound_input part extracted from it might not be. + + // May match if both the upper bound and lower bound indicate there could + // be overlap + return upper_bound_input.compare(smallest) >= 0 && + lower_bound_input.compare(largest) <= 0; + } + + protected: + struct MyBuilder : public SstQueryFilterBuilder { + MyBuilder(const BytewiseMinMaxSstQueryFilterConfig& _parent, + bool _sanity_checks) + : parent(_parent), sanity_checks(_sanity_checks) {} + + void Add(const Slice& key, const KeySegmentsExtractor::Result& extracted, + const Slice* prev_key, + const KeySegmentsExtractor::Result* prev_extracted) override { + Slice input, leadup; + GetFilterInput(parent.input_, key, extracted, &input, &leadup); + + if (sanity_checks && prev_key && prev_extracted) { + // Opportunistic checking of segment ordering invariant + Slice prev_input, prev_leadup; + GetFilterInput(parent.input_, *prev_key, *prev_extracted, &prev_input, + &prev_leadup); + + int compare = prev_leadup.compare(leadup); + if (compare > 0) { + status = Status::Corruption( + "Ordering invariant violated from 0x" + + prev_key->ToString(/*hex=*/true) + " with prefix 0x" + + prev_leadup.ToString(/*hex=*/true) + " to 0x" + + key.ToString(/*hex=*/true) + " with prefix 0x" + + leadup.ToString(/*hex=*/true)); + return; + } else if (compare == 0) { + // On the same prefix leading up to the segment, the segments must + // not be out of order. + compare = prev_input.compare(input); + if (compare > 0) { + status = Status::Corruption( + "Ordering invariant violated from 0x" + + prev_key->ToString(/*hex=*/true) + " with segment 0x" + + prev_input.ToString(/*hex=*/true) + " to 0x" + + key.ToString(/*hex=*/true) + " with segment 0x" + + input.ToString(/*hex=*/true)); + return; + } + } + } + + // Now actually update state for the filter inputs + // TODO: shorten largest and smallest if appropriate + if (input.empty()) { + empty_seen = true; + } else if (largest.empty()) { + // Step for first non-empty input + smallest = largest = input.ToString(); + } else if (input.compare(largest) > 0) { + largest = input.ToString(); + } else if (input.compare(smallest) < 0) { + smallest = input.ToString(); + } + } + + Status GetStatus() const override { return status; } + + size_t GetEncodedLength() const override { + if (largest.empty()) { + // Not an interesting filter -> 0 to indicate no filter + // FIXME: needs unit test + return 0; + } + return 2 + GetFilterInputSerializedLength(parent.input_) + + VarintLength(smallest.size()) + smallest.size() + largest.size(); + } + + void Finish(std::string& append_to) override { + assert(status.ok()); + size_t encoded_length = GetEncodedLength(); + if (encoded_length == 0) { + // Nothing to do + return; + } + size_t old_append_to_size = append_to.size(); + append_to.reserve(old_append_to_size + encoded_length); + append_to.push_back(kBytewiseMinMaxFilter); + + append_to.push_back(empty_seen ? kEmptySeenFlag : 0); + + SerializeFilterInput(&append_to, parent.input_); + + PutVarint32(&append_to, static_cast(smallest.size())); + append_to.append(smallest); + // The end of `largest` is given by the end of the filter + append_to.append(largest); + assert(append_to.size() == old_append_to_size + encoded_length); + } + + const BytewiseMinMaxSstQueryFilterConfig& parent; + const bool sanity_checks; + // Smallest and largest segment seen, excluding the empty segment which + // is tracked separately + std::string smallest; + std::string largest; + bool empty_seen = false; + + // Only for sanity checks + Status status; + }; + + private: + static constexpr char kEmptySeenFlag = 0x1; +}; + +const SstQueryFilterConfigs kEmptyNotFoundSQFC{}; + +class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager { + public: + using ConfigVersionMap = std::map; + + Status Populate(const Data& data) { + if (data.empty()) { + return Status::OK(); + } + // Populate only once + assert(min_ver_ == 0 && max_ver_ == 0); + min_ver_ = max_ver_ = data.begin()->first; + + FilteringVersion prev_ver = 0; + bool first_entry = true; + for (const auto& ver_info : data) { + if (ver_info.first == 0) { + return Status::InvalidArgument( + "Filtering version 0 is reserved for empty configuration and may " + "not be overridden"); + } + if (first_entry) { + min_ver_ = ver_info.first; + first_entry = false; + } else if (ver_info.first != prev_ver + 1) { + return Status::InvalidArgument( + "Filtering versions must increase by 1 without repeating: " + + std::to_string(prev_ver) + " -> " + std::to_string(ver_info.first)); + } + max_ver_ = ver_info.first; + UnorderedSet names_seen_this_ver; + for (const auto& config : ver_info.second) { + if (!names_seen_this_ver.insert(config.first).second) { + return Status::InvalidArgument( + "Duplicate name in filtering version " + + std::to_string(ver_info.first) + ": " + config.first); + } + auto& ver_map = name_map_[config.first]; + ver_map[ver_info.first] = config.second; + if (config.second.extractor) { + extractor_map_[config.second.extractor->GetId()] = + config.second.extractor; + } + } + prev_ver = ver_info.first; + } + return Status::OK(); + } + + struct MyCollector : public TablePropertiesCollector { + // Keeps a reference to `configs` which should be kept alive by + // SstQueryFilterConfigsManagerImpl, which should be kept alive by + // any factories + // TODO: sanity_checks option + explicit MyCollector(const SstQueryFilterConfigs& configs, + const SstQueryFilterConfigsManagerImpl& _parent) + : parent(_parent), + extractor(configs.extractor.get()), + sanity_checks(true) { + for (const auto& c : configs.filters) { + builders.push_back( + static_cast(*c).NewBuilder( + sanity_checks)); + } + } + + Status AddUserKey(const Slice& key, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + // FIXME later: `key` might contain user timestamp. That should be + // exposed properly in a future update to TablePropertiesCollector + KeySegmentsExtractor::Result extracted; + if (extractor) { + extractor->Extract(key, KeySegmentsExtractor::kFullUserKey, &extracted); + if (UNLIKELY(extracted.category >= + KeySegmentsExtractor::kMinErrorCategory)) { + // TODO: proper failure scopes + Status s = Status::Corruption( + "Extractor returned error category from key 0x" + + Slice(key).ToString(/*hex=*/true)); + overall_status.UpdateIfOk(s); + return s; + } + assert(extracted.category <= KeySegmentsExtractor::kMaxUsableCategory); + + bool new_category = categories_seen.Add(extracted.category); + if (sanity_checks) { + // Opportunistic checking of category ordering invariant + if (!first_key) { + if (prev_extracted.category != extracted.category && + !new_category) { + Status s = Status::Corruption( + "Category ordering invariant violated from key 0x" + + Slice(prev_key).ToString(/*hex=*/true) + " to 0x" + + key.ToString(/*hex=*/true)); + overall_status.UpdateIfOk(s); + return s; + } + } + } + } + for (const auto& b : builders) { + if (first_key) { + b->Add(key, extracted, nullptr, nullptr); + } else { + Slice prev_key_slice = Slice(prev_key); + b->Add(key, extracted, &prev_key_slice, &prev_extracted); + } + } + prev_key.assign(key.data(), key.size()); + prev_extracted = std::move(extracted); + first_key = false; + return Status::OK(); + } + Status Finish(UserCollectedProperties* properties) override { + assert(properties != nullptr); + + if (!overall_status.ok()) { + return overall_status; + } + + size_t total_size = 1; + autovector> filters_to_finish; + // Need to determine number of filters before serializing them. Might + // as well determine full length also. + for (const auto& b : builders) { + Status s = b->GetStatus(); + if (s.ok()) { + size_t len = b->GetEncodedLength(); + if (len > 0) { + total_size += VarintLength(len) + len; + filters_to_finish.emplace_back(*b, len); + } + } else { + // FIXME: no way to report partial failure without getting + // remaining filters thrown out + } + } + total_size += VarintLength(filters_to_finish.size()); + if (filters_to_finish.empty()) { + // No filters to add + return Status::OK(); + } + // Length of the last filter is omitted + total_size -= VarintLength(filters_to_finish.back().second); + + // Need to determine size of + // kExtrAndCatFilterWrapper if used + std::string extractor_id; + if (extractor) { + extractor_id = extractor->GetId(); + // identifier byte + total_size += 1; + // fields of the wrapper + total_size += VarintLength(extractor_id.size()) + extractor_id.size() + + VarintLength(CategorySetToUint(categories_seen)); + // outer layer will have just 1 filter in its count (added here) + // and this filter wrapper will have filters_to_finish.size() + // (added above). + total_size += VarintLength(1); + } + + std::string filters; + filters.reserve(total_size); + + // Leave room for drastic changes in the future. + filters.push_back(kSchemaVersion); + + if (extractor) { + // Wrap everything in a kExtrAndCatFilterWrapper + // TODO in future: put whole key filters outside of this wrapper. + // Also TODO in future: order the filters starting with broadest + // applicability. + + // Just one top-level filter (wrapper). Because it's last, we don't + // need to encode its length. + PutVarint64(&filters, 1); + // The filter(s) wrapper itself + filters.push_back(kExtrAndCatFilterWrapper); + PutVarint64(&filters, extractor_id.size()); + filters += extractor_id; + PutVarint64(&filters, CategorySetToUint(categories_seen)); + } + + PutVarint64(&filters, filters_to_finish.size()); + + for (const auto& e : filters_to_finish) { + // Encode filter length, except last filter + if (&e != &filters_to_finish.back()) { + PutVarint64(&filters, e.second); + } + // Encode filter + e.first.Finish(filters); + } + if (filters.size() != total_size) { + assert(false); + return Status::Corruption( + "Internal inconsistency building SST query filters"); + } + + (*properties)[kTablePropertyName] = std::move(filters); + return Status::OK(); + } + UserCollectedProperties GetReadableProperties() const override { + // TODO? + return {}; + } + const char* Name() const override { + // placeholder + return "SstQueryFilterConfigsImpl::MyCollector"; + } + + Status overall_status; + const SstQueryFilterConfigsManagerImpl& parent; + const KeySegmentsExtractor* const extractor; + const bool sanity_checks; + std::vector> builders; + bool first_key = true; + std::string prev_key; + KeySegmentsExtractor::Result prev_extracted; + KeySegmentsExtractor::KeyCategorySet categories_seen; + }; + + struct RangeQueryFilterReader { + Slice lower_bound_incl; + Slice upper_bound_excl; + const KeySegmentsExtractor* extractor; + const UnorderedMap>& + extractor_map; + + struct State { + KeySegmentsExtractor::Result lb_extracted; + KeySegmentsExtractor::Result ub_extracted; + }; + + bool MayMatch_CategoryScopeFilterWrapper(Slice wrapper, + State& state) const { + assert(!wrapper.empty() && wrapper[0] == kCategoryScopeFilterWrapper); + + // Regardless of the filter values (which we assume is not all + // categories; that should skip the wrapper), we need upper bound and + // lower bound to be in the same category to do any range filtering. + // (There could be another category in range between the bounds.) + if (state.lb_extracted.category != state.ub_extracted.category) { + // Can't filter between categories + return true; + } + + const char* p = wrapper.data() + 1; + const char* limit = wrapper.data() + wrapper.size(); + + uint64_t cats_raw; + p = GetVarint64Ptr(p, limit, &cats_raw); + if (p == nullptr) { + // Missing categories + return true; + } + KeySegmentsExtractor::KeyCategorySet categories = + UintToCategorySet(cats_raw); + + // Check category against those in scope + if (!categories.Contains(state.lb_extracted.category)) { + // Can't filter this category + return true; + } + + // Process the wrapped filters + return MayMatch(Slice(p, limit - p), &state); + } + + bool MayMatch_ExtrAndCatFilterWrapper(Slice wrapper) const { + assert(!wrapper.empty() && wrapper[0] == kExtrAndCatFilterWrapper); + if (wrapper.size() <= 4) { + // Missing some data + // (1 byte marker, >= 1 byte name length, >= 1 byte name, >= 1 byte + // categories, ...) + return true; + } + const char* p = wrapper.data() + 1; + const char* limit = wrapper.data() + wrapper.size(); + uint64_t name_len; + p = GetVarint64Ptr(p, limit, &name_len); + if (p == nullptr || name_len == 0 || + static_cast(limit - p) < name_len) { + // Missing some data + return true; + } + Slice name(p, name_len); + p += name_len; + const KeySegmentsExtractor* ex = nullptr; + if (extractor && name == Slice(extractor->GetId())) { + ex = extractor; + } else { + auto it = extractor_map.find(name.ToString()); + if (it != extractor_map.end()) { + ex = it->second.get(); + } else { + // Extractor mismatch / not found + // TODO future: try to get the extractor from the ObjectRegistry + return true; + } + } + + // TODO future: cache extraction? + + // Ready to run extractor + assert(ex); + State state; + ex->Extract(lower_bound_incl, KeySegmentsExtractor::kInclusiveLowerBound, + &state.lb_extracted); + if (UNLIKELY(state.lb_extracted.category >= + KeySegmentsExtractor::kMinErrorCategory)) { + // TODO? Report problem + // No filtering + return true; + } + assert(state.lb_extracted.category <= + KeySegmentsExtractor::kMaxUsableCategory); + + ex->Extract(upper_bound_excl, KeySegmentsExtractor::kExclusiveUpperBound, + &state.ub_extracted); + if (UNLIKELY(state.ub_extracted.category >= + KeySegmentsExtractor::kMinErrorCategory)) { + // TODO? Report problem + // No filtering + return true; + } + assert(state.ub_extracted.category <= + KeySegmentsExtractor::kMaxUsableCategory); + + uint64_t cats_raw; + p = GetVarint64Ptr(p, limit, &cats_raw); + if (p == nullptr) { + // Missing categories + return true; + } + KeySegmentsExtractor::KeyCategorySet categories = + UintToCategorySet(cats_raw); + + // Can only filter out based on category if upper and lower bound have + // the same category. (Each category is contiguous by key order, but we + // don't know the order between categories.) + if (state.lb_extracted.category == state.ub_extracted.category && + !categories.Contains(state.lb_extracted.category)) { + // Filtered out + return false; + } + + // Process the wrapped filters + return MayMatch(Slice(p, limit - p), &state); + } + + bool MayMatch(Slice filters, State* state = nullptr) const { + const char* p = filters.data(); + const char* limit = p + filters.size(); + uint64_t filter_count; + p = GetVarint64Ptr(p, limit, &filter_count); + if (p == nullptr || filter_count == 0) { + // TODO? Report problem + // No filtering + return true; + } + + for (size_t i = 0; i < filter_count; ++i) { + uint64_t filter_len; + if (i + 1 == filter_count) { + // Last filter + filter_len = static_cast(limit - p); + } else { + p = GetVarint64Ptr(p, limit, &filter_len); + if (p == nullptr || filter_len == 0 || + static_cast(limit - p) < filter_len) { + // TODO? Report problem + // No filtering + return true; + } + } + Slice filter = Slice(p, filter_len); + p += filter_len; + bool may_match = true; + char type = filter[0]; + switch (type) { + case kExtrAndCatFilterWrapper: + may_match = MayMatch_ExtrAndCatFilterWrapper(filter); + break; + case kCategoryScopeFilterWrapper: + if (state == nullptr) { + // TODO? Report problem + // No filtering + return true; + } + may_match = MayMatch_CategoryScopeFilterWrapper(filter, *state); + break; + case kBytewiseMinMaxFilter: + if (state == nullptr) { + // TODO? Report problem + // No filtering + return true; + } + may_match = BytewiseMinMaxSstQueryFilterConfig::RangeMayMatch( + filter, lower_bound_incl, state->lb_extracted, upper_bound_excl, + state->ub_extracted); + break; + default: + // TODO? Report problem + {} + // Unknown filter type + } + if (!may_match) { + // Successfully filtered + return false; + } + } + + // Wasn't filtered + return true; + } + }; + + struct MyFactory : public Factory { + explicit MyFactory( + std::shared_ptr _parent, + const std::string& _configs_name) + : parent(std::move(_parent)), + ver_map(parent->GetVerMap(_configs_name)), + configs_name(_configs_name) {} + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + auto& configs = GetConfigs(); + if (configs.IsEmptyNotFound()) { + return nullptr; + } + return new MyCollector(configs, *parent); + } + const char* Name() const override { + // placeholder + return "SstQueryFilterConfigsManagerImpl::MyFactory"; + } + + Status SetFilteringVersion(FilteringVersion ver) override { + if (ver > 0 && ver < parent->min_ver_) { + return Status::InvalidArgument( + "Filtering version is before earliest known configuration: " + + std::to_string(ver) + " < " + std::to_string(parent->min_ver_)); + } + if (ver > parent->max_ver_) { + return Status::InvalidArgument( + "Filtering version is after latest known configuration: " + + std::to_string(ver) + " > " + std::to_string(parent->max_ver_)); + } + version.StoreRelaxed(ver); + return Status::OK(); + } + FilteringVersion GetFilteringVersion() const override { + return version.LoadRelaxed(); + } + const std::string& GetConfigsName() const override { return configs_name; } + const SstQueryFilterConfigs& GetConfigs() const override { + FilteringVersion ver = version.LoadRelaxed(); + if (ver == 0) { + // Special case + return kEmptyNotFoundSQFC; + } + assert(ver >= parent->min_ver_); + assert(ver <= parent->max_ver_); + auto it = ver_map.upper_bound(ver); + if (it == ver_map.begin()) { + return kEmptyNotFoundSQFC; + } else { + --it; + return it->second; + } + } + + // The buffers pointed to by the Slices must live as long as any read + // operations using this table filter function. + std::function GetTableFilterForRangeQuery( + Slice lower_bound_incl, Slice upper_bound_excl) const override { + // TODO: cache extractor results between SST files, assuming most will + // use the same version + return + [rqf = RangeQueryFilterReader{ + lower_bound_incl, upper_bound_excl, GetConfigs().extractor.get(), + parent->extractor_map_}](const TableProperties& props) -> bool { + auto it = props.user_collected_properties.find(kTablePropertyName); + if (it == props.user_collected_properties.end()) { + // No filtering + return true; + } + auto& filters = it->second; + // Parse the serialized filters string + if (filters.size() < 2 || filters[0] != kSchemaVersion) { + // TODO? Report problem + // No filtering + return true; + } + return rqf.MayMatch(Slice(filters.data() + 1, filters.size() - 1)); + }; + } + + const std::shared_ptr parent; + const ConfigVersionMap& ver_map; + const std::string configs_name; + RelaxedAtomic version; + }; + + Status MakeSharedFactory(const std::string& configs_name, + FilteringVersion ver, + std::shared_ptr* out) const override { + auto obj = std::make_shared( + static_cast_with_check( + shared_from_this()), + configs_name); + Status s = obj->SetFilteringVersion(ver); + if (s.ok()) { + *out = std::move(obj); + } + return s; + } + + const ConfigVersionMap& GetVerMap(const std::string& configs_name) const { + static const ConfigVersionMap kEmptyMap; + auto it = name_map_.find(configs_name); + if (it == name_map_.end()) { + return kEmptyMap; + } + return it->second; + } + + private: + static const std::string kTablePropertyName; + static constexpr char kSchemaVersion = 1; + + private: + UnorderedMap name_map_; + UnorderedMap> + extractor_map_; + FilteringVersion min_ver_ = 0; + FilteringVersion max_ver_ = 0; +}; + +// SstQueryFilterConfigs +const std::string SstQueryFilterConfigsManagerImpl::kTablePropertyName = + "rocksdb.sqfc"; +} // namespace + +bool SstQueryFilterConfigs::IsEmptyNotFound() const { + return this == &kEmptyNotFoundSQFC; +} + +std::shared_ptr MakeSharedBytewiseMinMaxSQFC( + FilterInput input, KeySegmentsExtractor::KeyCategorySet categories) { + return std::make_shared(input, + categories); +} + +Status SstQueryFilterConfigsManager::MakeShared( + const Data& data, std::shared_ptr* out) { + auto obj = std::make_shared(); + Status s = obj->Populate(data); + if (s.ok()) { + *out = std::move(obj); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE::experimental diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 749a172ac60..0d260fbf5ce 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,9 +9,12 @@ #include "db/version_edit.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/options.h" #include "rocksdb/sst_file_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/defer.h" #include "util/random.h" #include "utilities/fault_injection_env.h" @@ -244,7 +247,7 @@ class ChecksumVerifyHelper { public: ChecksumVerifyHelper(Options& options) : options_(options) {} - ~ChecksumVerifyHelper() {} + ~ChecksumVerifyHelper() = default; Status GetSingleFileChecksumAndFuncName( const std::string& file_path, std::string* file_checksum, @@ -472,7 +475,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { std::vector live_files; dbfull()->GetLiveFilesMetaData(&live_files); std::set set1; - for (auto f : live_files) { + for (const auto& f : live_files) { set1.insert(f.name); ASSERT_EQ(f.file_checksum, kUnknownFileChecksum); ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName); @@ -521,7 +524,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_OK(s) << s.ToString(); std::vector live_files1; dbfull()->GetLiveFilesMetaData(&live_files1); - for (auto f : live_files1) { + for (const auto& f : live_files1) { if (set1.find(f.name) == set1.end()) { ASSERT_EQ(f.file_checksum, file_checksum2); ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2); @@ -538,7 +541,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_OK(s) << s.ToString(); std::vector live_files2; dbfull()->GetLiveFilesMetaData(&live_files2); - for (auto f : live_files2) { + for (const auto& f : live_files2) { if (set1.find(f.name) == set1.end()) { ASSERT_EQ(f.file_checksum, file_checksum3); ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3); @@ -561,7 +564,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_OK(s) << s.ToString(); std::vector live_files3; dbfull()->GetLiveFilesMetaData(&live_files3); - for (auto f : live_files3) { + for (const auto& f : live_files3) { if (set1.find(f.name) == set1.end()) { ASSERT_FALSE(f.file_checksum == file_checksum4); ASSERT_EQ(f.file_checksum, "asd"); @@ -581,7 +584,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_OK(s) << s.ToString(); std::vector live_files4; dbfull()->GetLiveFilesMetaData(&live_files4); - for (auto f : live_files4) { + for (const auto& f : live_files4) { if (set1.find(f.name) == set1.end()) { std::string cur_checksum5, cur_checksum_func_name5; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( @@ -603,7 +606,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_OK(s) << s.ToString(); std::vector live_files6; dbfull()->GetLiveFilesMetaData(&live_files6); - for (auto f : live_files6) { + for (const auto& f : live_files6) { if (set1.find(f.name) == set1.end()) { ASSERT_EQ(f.file_checksum, file_checksum6); ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6); @@ -1093,7 +1096,7 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) { size_t total_fadvised_bytes = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) { - size_t fadvise_size = *(reinterpret_cast(arg)); + size_t fadvise_size = *(static_cast(arg)); total_fadvised_bytes += fadvise_size; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -1292,6 +1295,80 @@ TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Destroy(options); } +TEST_F(ExternalSSTFileBasicTest, ReadOldValueOfIngestedKeyBug) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.disable_auto_compactions = true; + options.num_levels = 3; + options.preserve_internal_time_seconds = 36000; + DestroyAndReopen(options); + + // To create the following LSM tree to trigger the bug: + // L0 + // L1 with seqno [1, 2] + // L2 with seqno [3, 4] + + // To create L1 shape + ASSERT_OK( + db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k1", "seqno1")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK( + db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k1", "seqno2")); + ASSERT_OK(db_->Flush(FlushOptions())); + ColumnFamilyMetaData meta_1; + db_->GetColumnFamilyMetaData(&meta_1); + auto& files_1 = meta_1.levels[0].files; + ASSERT_EQ(files_1.size(), 2); + std::string file1 = files_1[0].db_path + files_1[0].name; + std::string file2 = files_1[1].db_path + files_1[1].name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {file1, file2}, 1)); + // To confirm L1 shape + ColumnFamilyMetaData meta_2; + db_->GetColumnFamilyMetaData(&meta_2); + ASSERT_EQ(meta_2.levels[0].files.size(), 0); + ASSERT_EQ(meta_2.levels[1].files.size(), 1); + // Seqno starts from non-zero due to seqno reservation for + // preserve_internal_time_seconds greater than 0; + ASSERT_EQ(meta_2.levels[1].files[0].largest_seqno, 102); + ASSERT_EQ(meta_2.levels[2].files.size(), 0); + // To create L2 shape + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k2overlap", + "old_value")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), "k2overlap", + "old_value")); + ASSERT_OK(db_->Flush(FlushOptions())); + ColumnFamilyMetaData meta_3; + db_->GetColumnFamilyMetaData(&meta_3); + auto& files_3 = meta_3.levels[0].files; + std::string file3 = files_3[0].db_path + files_3[0].name; + std::string file4 = files_3[1].db_path + files_3[1].name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {file3, file4}, 2)); + // To confirm L2 shape + ColumnFamilyMetaData meta_4; + db_->GetColumnFamilyMetaData(&meta_4); + ASSERT_EQ(meta_4.levels[0].files.size(), 0); + ASSERT_EQ(meta_4.levels[1].files.size(), 1); + ASSERT_EQ(meta_4.levels[2].files.size(), 1); + ASSERT_EQ(meta_4.levels[2].files[0].largest_seqno, 104); + + // Ingest a file with new value of the key "k2overlap" + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string f = sst_files_dir_ + "f.sst"; + ASSERT_OK(sst_file_writer.Open(f)); + ASSERT_OK(sst_file_writer.Put("k2overlap", "new_value")); + ExternalSstFileInfo f_info; + ASSERT_OK(sst_file_writer.Finish(&f_info)); + ASSERT_OK(db_->IngestExternalFile({f}, IngestExternalFileOptions())); + + // To verify new value of the key "k2overlap" is correctly returned + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "k2overlap", &value)); + // Before the fix, the value would be "old_value" and assertion failed + ASSERT_EQ(value, "new_value"); +} + TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) { for (int i = 5; i < 25; i++) { ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i), @@ -1556,7 +1633,7 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { bool change_checksum_called = false; const auto& change_checksum = [&](void* arg) { if (!change_checksum_called) { - char* buf = reinterpret_cast(arg); + char* buf = static_cast(arg); assert(nullptr != buf); buf[0] ^= 0x1; change_checksum_called = true; @@ -1653,10 +1730,10 @@ TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) { uint64_t props_block_offset = 0; size_t props_block_size = 0; const auto& get_props_block_offset = [&](void* arg) { - props_block_offset = *reinterpret_cast(arg); + props_block_offset = *static_cast(arg); }; const auto& get_props_block_size = [&](void* arg) { - props_block_size = *reinterpret_cast(arg); + props_block_size = *static_cast(arg); }; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -1785,100 +1862,166 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { } TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) { - Options options = CurrentOptions(); - const ImmutableCFOptions ioptions(options); - options.bottommost_temperature = Temperature::kWarm; - SstFileWriter sst_file_writer(EnvOptions(), options); - options.level0_file_num_compaction_trigger = 2; - Reopen(options); + // Rather than doubling the running time of this test, this boolean + // field gets a random starting value and then alternates between + // true and false. + bool alternate_hint = Random::GetTLSInstance()->OneIn(2); + Destroy(CurrentOptions()); - auto size = GetSstSizeHelper(Temperature::kUnknown); - ASSERT_EQ(size, 0); - size = GetSstSizeHelper(Temperature::kWarm); - ASSERT_EQ(size, 0); - size = GetSstSizeHelper(Temperature::kHot); - ASSERT_EQ(size, 0); + for (std::string mode : {"ingest_behind", "fail_if_not", "neither"}) { + SCOPED_TRACE("Mode: " + mode); - // create file01.sst (1000 => 1099) and ingest it - std::string file1 = sst_files_dir_ + "file01.sst"; - ASSERT_OK(sst_file_writer.Open(file1)); - for (int k = 1000; k < 1100; k++) { - ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + Options options = CurrentOptions(); + + auto test_fs = + std::make_shared(options.env->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(options.env, test_fs)); + options.env = env.get(); + + const ImmutableCFOptions ioptions(options); + options.last_level_temperature = Temperature::kCold; + options.default_write_temperature = Temperature::kHot; + SstFileWriter sst_file_writer(EnvOptions(), options); + options.level0_file_num_compaction_trigger = 2; + options.allow_ingest_behind = (mode == "ingest_behind"); + Reopen(options); + Defer destroyer([&]() { Destroy(options); }); + +#define VERIFY_SST_COUNT(temp, expected_count_in_db, \ + expected_count_outside_db) \ + { \ + /* Partially verify against FileSystem */ \ + ASSERT_EQ( \ + test_fs->CountCurrentSstFilesWithTemperature(temp), \ + size_t{expected_count_in_db} + size_t{expected_count_outside_db}); \ + /* Partially verify against DB manifest */ \ + if (expected_count_in_db == 0) { \ + ASSERT_EQ(GetSstSizeHelper(temp), 0); \ + } else { \ + ASSERT_GE(GetSstSizeHelper(temp), 1); \ + } \ } - ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_OK(s); - ASSERT_EQ(file1_info.file_path, file1); - ASSERT_EQ(file1_info.num_entries, 100); - ASSERT_EQ(file1_info.smallest_key, Key(1000)); - ASSERT_EQ(file1_info.largest_key, Key(1099)); - std::vector files; - std::vector files_checksums; - std::vector files_checksum_func_names; - Temperature file_temperature = Temperature::kWarm; - - files.push_back(file1); - IngestExternalFileOptions in_opts; - in_opts.move_files = false; - in_opts.snapshot_consistency = true; - in_opts.allow_global_seqno = false; - in_opts.allow_blocking_flush = false; - in_opts.write_global_seqno = true; - in_opts.verify_file_checksum = false; - IngestExternalFileArg arg; - arg.column_family = db_->DefaultColumnFamily(); - arg.external_files = files; - arg.options = in_opts; - arg.files_checksums = files_checksums; - arg.files_checksum_func_names = files_checksum_func_names; - arg.file_temperature = file_temperature; - s = db_->IngestExternalFiles({arg}); - ASSERT_OK(s); + size_t ex_unknown_in_db = 0; + size_t ex_hot_in_db = 0; + size_t ex_warm_in_db = 0; + size_t ex_cold_in_db = 0; + size_t ex_unknown_outside_db = 0; + size_t ex_hot_outside_db = 0; + size_t ex_warm_outside_db = 0; + size_t ex_cold_outside_db = 0; +#define VERIFY_SST_COUNTS() \ + { \ + VERIFY_SST_COUNT(Temperature::kUnknown, ex_unknown_in_db, \ + ex_unknown_outside_db); \ + VERIFY_SST_COUNT(Temperature::kHot, ex_hot_in_db, ex_hot_outside_db); \ + VERIFY_SST_COUNT(Temperature::kWarm, ex_warm_in_db, ex_warm_outside_db); \ + VERIFY_SST_COUNT(Temperature::kCold, ex_cold_in_db, ex_cold_outside_db); \ + } - // check the temperature of the file being ingested - ColumnFamilyMetaData metadata; - db_->GetColumnFamilyMetaData(&metadata); - ASSERT_EQ(1, metadata.file_count); - ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); - size = GetSstSizeHelper(Temperature::kUnknown); - ASSERT_EQ(size, 0); - size = GetSstSizeHelper(Temperature::kWarm); - ASSERT_GT(size, 1); + // Create sst file, using a name recognized by FileTemperatureTestFS and + // specified temperature + std::string file1 = sst_files_dir_ + "9000000.sst"; + ASSERT_OK(sst_file_writer.Open(file1, Temperature::kWarm)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s); - // non-bottommost file still has unknown temperature - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("bar", "bar")); - ASSERT_OK(Flush()); - db_->GetColumnFamilyMetaData(&metadata); - ASSERT_EQ(2, metadata.file_count); - ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); - size = GetSstSizeHelper(Temperature::kUnknown); - ASSERT_GT(size, 0); - size = GetSstSizeHelper(Temperature::kWarm); - ASSERT_GT(size, 0); + ex_warm_outside_db++; + VERIFY_SST_COUNTS(); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + + std::vector files; + std::vector files_checksums; + std::vector files_checksum_func_names; + + files.push_back(file1); + IngestExternalFileOptions in_opts; + in_opts.move_files = false; + in_opts.snapshot_consistency = true; + in_opts.allow_global_seqno = false; + in_opts.allow_blocking_flush = false; + in_opts.write_global_seqno = true; + in_opts.verify_file_checksum = false; + in_opts.ingest_behind = (mode == "ingest_behind"); + in_opts.fail_if_not_bottommost_level = (mode == "fail_if_not"); + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = files; + arg.options = in_opts; + arg.files_checksums = files_checksums; + arg.files_checksum_func_names = files_checksum_func_names; + alternate_hint = !alternate_hint; + if (alternate_hint) { + // Provide correct hint (for optimal file open performance) + arg.file_temperature = Temperature::kWarm; + } else { + // No hint (also works because ingestion will read the temperature + // according to storage) + arg.file_temperature = Temperature::kUnknown; + } + s = db_->IngestExternalFiles({arg}); + ASSERT_OK(s); - // reopen and check the information is persisted - Reopen(options); - db_->GetColumnFamilyMetaData(&metadata); - ASSERT_EQ(2, metadata.file_count); - ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); - ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); - size = GetSstSizeHelper(Temperature::kUnknown); - ASSERT_GT(size, 0); - size = GetSstSizeHelper(Temperature::kWarm); - ASSERT_GT(size, 0); + // check the temperature of the file ingested (copied) + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); - // check other non-exist temperatures - size = GetSstSizeHelper(Temperature::kHot); - ASSERT_EQ(size, 0); - size = GetSstSizeHelper(Temperature::kCold); - ASSERT_EQ(size, 0); - std::string prop; - ASSERT_TRUE(dbfull()->GetProperty( - DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), - &prop)); - ASSERT_EQ(std::atoi(prop.c_str()), 0); + if (mode != "neither") { + ASSERT_EQ(Temperature::kCold, metadata.levels[6].files[0].temperature); + ex_cold_in_db++; + } else { + // Currently, we are only able to use last_level_temperature for ingestion + // when using an ingestion option that guarantees ingestion to last level. + ASSERT_EQ(Temperature::kHot, metadata.levels[6].files[0].temperature); + ex_hot_in_db++; + } + VERIFY_SST_COUNTS(); + + // non-bottommost file still has kHot temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature); + + ex_hot_in_db++; + VERIFY_SST_COUNTS(); + + // reopen and check the information is persisted + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature); + if (mode != "neither") { + ASSERT_EQ(Temperature::kCold, metadata.levels[6].files[0].temperature); + } else { + ASSERT_EQ(Temperature::kHot, metadata.levels[6].files[0].temperature); + } + + // (no change) + VERIFY_SST_COUNTS(); + + // check invalid temperature with DB property. Not sure why the original + // author is testing this case, but perhaps so that downgrading DB with + // new GetProperty code using a new Temperature will report something + // reasonable and not an error. + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); +#undef VERIFY_SST_COUNT + } } TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 3cc4d6752e0..587e283625f 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). - #include "db/external_sst_file_ingestion_job.h" #include @@ -18,12 +17,11 @@ #include "file/random_access_file_reader.h" #include "logging/logging.h" #include "table/merging_iterator.h" -#include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" #include "table/unique_id_impl.h" #include "test_util/sync_point.h" -#include "util/stop_watch.h" +#include "util/udt_util.h" namespace ROCKSDB_NAMESPACE { @@ -38,6 +36,8 @@ Status ExternalSstFileIngestionJob::Prepare( // Read the information of files we are ingesting for (const std::string& file_path : external_files_paths) { IngestedFileInfo file_to_ingest; + // For temperature, first assume it matches provided hint + file_to_ingest.file_temperature = file_temperature; status = GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv); if (!status.ok()) { @@ -91,13 +91,16 @@ Status ExternalSstFileIngestionJob::Prepare( } } - // Hanlde the file temperature - for (size_t i = 0; i < num_files; i++) { - files_to_ingest_[i].file_temperature = file_temperature; + if (ingestion_options_.ingest_behind && files_overlap_) { + return Status::NotSupported( + "Files with overlapping ranges cannot be ingested with ingestion " + "behind mode."); } - if (ingestion_options_.ingest_behind && files_overlap_) { - return Status::NotSupported("Files have overlapping ranges"); + if (ucmp->timestamp_size() > 0 && files_overlap_) { + return Status::NotSupported( + "Files with overlapping ranges cannot be ingested to column " + "family with user-defined timestamp enabled."); } // Copy/Move external files into DB @@ -144,7 +147,7 @@ Status ExternalSstFileIngestionJob::Prepare( // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; ROCKS_LOG_INFO(db_options_.info_log, - "Triy to link file %s but it's not supported : %s", + "Tried to link file %s but it's not supported : %s", path_outside_db.c_str(), status.ToString().c_str()); } } else { @@ -154,10 +157,25 @@ Status ExternalSstFileIngestionJob::Prepare( if (f.copy_file) { TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", nullptr); - // CopyFile also sync the new file. - status = - CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, - db_options_.use_fsync, io_tracer_, Temperature::kUnknown); + // Always determining the destination temperature from the ingested-to + // level would be difficult because in general we only find out the level + // ingested to later, during Run(). + // However, we can guarantee "last level" temperature for when the user + // requires ingestion to the last level. + Temperature dst_temp = + (ingestion_options_.ingest_behind || + ingestion_options_.fail_if_not_bottommost_level) + ? sv->mutable_cf_options.last_level_temperature + : sv->mutable_cf_options.default_write_temperature; + // Note: CopyFile also syncs the new file. + status = CopyFile(fs_.get(), path_outside_db, f.file_temperature, + path_inside_db, dst_temp, 0, db_options_.use_fsync, + io_tracer_); + // The destination of the copy will be ingested + f.file_temperature = dst_temp; + } else { + // Note: we currently assume that linking files does not cross + // temperatures, so no need to change f.file_temperature } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { @@ -190,7 +208,7 @@ Status ExternalSstFileIngestionJob::Prepare( // Generate and check the sst file checksum. Note that, if // IngestExternalFileOptions::write_global_seqno is true, we will not update // the checksum information in the files_to_ingests_ here, since the file is - // upadted with the new global_seqno. After global_seqno is updated, DB will + // updated with the new global_seqno. After global_seqno is updated, DB will // generate the new checksum and store it in the Manifest. In all other cases // if ingestion_options_.write_global_seqno == true and // verify_file_checksum is false, we only check the checksum function name. @@ -301,8 +319,7 @@ Status ExternalSstFileIngestionJob::Prepare( } } } else if (files_checksums.size() != files_checksum_func_names.size() || - (files_checksums.size() == files_checksum_func_names.size() && - files_checksums.size() != 0)) { + files_checksums.size() != 0) { // The checksum or checksum function name vector are not both empty // and they are incomplete. status = Status::InvalidArgument( @@ -318,59 +335,32 @@ Status ExternalSstFileIngestionJob::Prepare( } } - // TODO: The following is duplicated with Cleanup(). - if (!status.ok()) { - IOOptions io_opts; - // We failed, remove all files that we copied into the db - for (IngestedFileInfo& f : files_to_ingest_) { - if (f.internal_file_path.empty()) { - continue; - } - Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); - if (!s.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "AddFile() clean up for file %s failed : %s", - f.internal_file_path.c_str(), s.ToString().c_str()); - } - } - } - return status; } Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, SuperVersion* super_version) { - autovector ranges; - autovector keys; - size_t ts_sz = cfd_->user_comparator()->timestamp_size(); - if (ts_sz) { - // Check all ranges [begin, end] inclusively. Add maximum - // timestamp to include all `begin` keys, and add minimal timestamp to - // include all `end` keys. - for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { - std::string begin_str; - std::string end_str; - AppendUserKeyWithMaxTimestamp( - &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz); - AppendUserKeyWithMinTimestamp( - &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz); - keys.emplace_back(std::move(begin_str)); - keys.emplace_back(std::move(end_str)); - } - for (size_t i = 0; i < files_to_ingest_.size(); ++i) { - ranges.emplace_back(keys[2 * i], keys[2 * i + 1]); - } - } else { - for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { - ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), - file_to_ingest.largest_internal_key.user_key()); - } + size_t n = files_to_ingest_.size(); + autovector ranges; + ranges.reserve(n); + for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), + file_to_ingest.largest_internal_key.user_key()); } Status status = cfd_->RangesOverlapWithMemtables( ranges, super_version, db_options_.allow_data_in_errors, flush_needed); - if (status.ok() && *flush_needed && - !ingestion_options_.allow_blocking_flush) { - status = Status::InvalidArgument("External file requires flush"); + if (status.ok() && *flush_needed) { + if (!ingestion_options_.allow_blocking_flush) { + status = Status::InvalidArgument("External file requires flush"); + } + auto ucmp = cfd_->user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + status = Status::InvalidArgument( + "Column family enables user-defined timestamps, please make " + "sure the key range (without timestamp) of external file does not " + "overlap with key range in the memtables."); + } } return status; } @@ -389,7 +379,7 @@ Status ExternalSstFileIngestionJob::Run() { return status; } if (need_flush) { - return Status::TryAgain(); + return Status::TryAgain("need_flush"); } assert(status.ok() && need_flush == false); #endif @@ -433,26 +423,26 @@ Status ExternalSstFileIngestionJob::Run() { if (!status.ok()) { return status; } - if (smallest_parsed.sequence == 0) { + if (smallest_parsed.sequence == 0 && assigned_seqno != 0) { UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, smallest_parsed.type); } - if (largest_parsed.sequence == 0) { + if (largest_parsed.sequence == 0 && assigned_seqno != 0) { UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, largest_parsed.type); } status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); + if (!status.ok()) { + return status; + } TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); + assert(assigned_seqno == 0 || assigned_seqno == last_seqno + 1); if (assigned_seqno > last_seqno) { - assert(assigned_seqno == last_seqno + 1); last_seqno = assigned_seqno; ++consumed_seqno_count_; } - if (!status.ok()) { - return status; - } status = GenerateChecksumForIngestedFile(&f); if (!status.ok()) { @@ -487,8 +477,7 @@ Status ExternalSstFileIngestionJob::Run() { ? kReservedEpochNumberForFileIngestedBehind : cfd_->NewEpochNumber(), f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size, - static_cast( - f.table_properties.user_defined_timestamps_persisted)); + f.user_defined_timestamps_persisted); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } @@ -539,7 +528,8 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() { , LLONG_MAX /* max compaction bytes, not applicable */, 0 /* output path ID, not applicable */, mutable_cf_options.compression, - mutable_cf_options.compression_opts, Temperature::kUnknown, + mutable_cf_options.compression_opts, + mutable_cf_options.default_write_temperature, 0 /* max_subcompaction, not applicable */, {} /* grandparents, not applicable */, false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */, @@ -633,17 +623,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { if (!status.ok()) { // We failed to add the files to the database // remove all the files we copied - for (IngestedFileInfo& f : files_to_ingest_) { - if (f.internal_file_path.empty()) { - continue; - } - Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); - if (!s.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "AddFile() clean up for file %s failed : %s", - f.internal_file_path.c_str(), s.ToString().c_str()); - } - } + DeleteInternalFiles(); consumed_seqno_count_ = 0; files_overlap_ = false; } else if (status.ok() && ingestion_options_.move_files) { @@ -661,38 +641,44 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { } } -Status ExternalSstFileIngestionJob::GetIngestedFileInfo( - const std::string& external_file, uint64_t new_file_number, - IngestedFileInfo* file_to_ingest, SuperVersion* sv) { - file_to_ingest->external_file_path = external_file; - - // Get external file size - Status status = fs_->GetFileSize(external_file, IOOptions(), - &file_to_ingest->file_size, nullptr); - if (!status.ok()) { - return status; +void ExternalSstFileIngestionJob::DeleteInternalFiles() { + IOOptions io_opts; + for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } } +} - // Assign FD with number - file_to_ingest->fd = - FileDescriptor(new_file_number, 0, file_to_ingest->file_size); - - // Create TableReader for external file - std::unique_ptr table_reader; +Status ExternalSstFileIngestionJob::ResetTableReader( + const std::string& external_file, uint64_t new_file_number, + bool user_defined_timestamps_persisted, SuperVersion* sv, + IngestedFileInfo* file_to_ingest, + std::unique_ptr* table_reader) { std::unique_ptr sst_file; - std::unique_ptr sst_file_reader; - - status = - fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr); + FileOptions fo{env_options_}; + fo.temperature = file_to_ingest->file_temperature; + Status status = + fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr); if (!status.ok()) { return status; } - sst_file_reader.reset(new RandomAccessFileReader( - std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); - - // TODO(yuzhangyu): User-defined timestamps doesn't support external sst file - // ingestion. Pass in the correct `user_defined_timestamps_persisted` flag - // for creating `TableReaderOptions` when the support is there. + Temperature updated_temp = sst_file->GetTemperature(); + if (updated_temp != Temperature::kUnknown && + updated_temp != file_to_ingest->file_temperature) { + // The hint was missing or wrong. Track temperature reported by storage. + file_to_ingest->file_temperature = updated_temp; + } + std::unique_ptr sst_file_reader( + new RandomAccessFileReader(std::move(sst_file), external_file, + nullptr /*Env*/, io_tracer_)); + table_reader->reset(); status = cfd_->ioptions()->table_factory->NewTableReader( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, @@ -702,28 +688,20 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), - /*cur_file_num*/ new_file_number), - std::move(sst_file_reader), file_to_ingest->file_size, &table_reader); - if (!status.ok()) { - return status; - } - - if (ingestion_options_.verify_checksums_before_ingest) { - // If customized readahead size is needed, we can pass a user option - // all the way to here. Right now we just rely on the default readahead - // to keep things simple. - // TODO: plumb Env::IOActivity - ReadOptions ro; - ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; - status = table_reader->VerifyChecksum( - ro, TableReaderCaller::kExternalSSTIngestion); - if (!status.ok()) { - return status; - } - } + /*cur_file_num*/ new_file_number, + /* unique_id */ {}, /* largest_seqno */ 0, + /* tail_size */ 0, user_defined_timestamps_persisted), + std::move(sst_file_reader), file_to_ingest->file_size, table_reader); + return status; +} +Status ExternalSstFileIngestionJob::SanityCheckTableProperties( + const std::string& external_file, uint64_t new_file_number, + SuperVersion* sv, IngestedFileInfo* file_to_ingest, + std::unique_ptr* table_reader) { // Get the external file properties - auto props = table_reader->GetTableProperties(); + auto props = table_reader->get()->GetTableProperties(); + assert(props.get()); const auto& uprops = props->user_collected_properties; // Get table version @@ -761,12 +739,101 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } else { return Status::InvalidArgument("External file version is not supported"); } + + file_to_ingest->cf_id = static_cast(props->column_family_id); + // This assignment works fine even though `table_reader` may later be reset, + // since that will not affect how table properties are parsed, and this + // assignment is making a copy. + file_to_ingest->table_properties = *props; + // Get number of entries in table file_to_ingest->num_entries = props->num_entries; file_to_ingest->num_range_deletions = props->num_range_deletions; + // Validate table properties related to comparator name and user defined + // timestamps persisted flag. + file_to_ingest->user_defined_timestamps_persisted = + static_cast(props->user_defined_timestamps_persisted); + bool mark_sst_file_has_no_udt = false; + Status s = ValidateUserDefinedTimestampsOptions( + cfd_->user_comparator(), props->comparator_name, + cfd_->ioptions()->persist_user_defined_timestamps, + file_to_ingest->user_defined_timestamps_persisted, + &mark_sst_file_has_no_udt); + if (s.ok() && mark_sst_file_has_no_udt) { + // A column family that enables user-defined timestamps in Memtable only + // feature can also ingest external files created by a setting that disables + // user-defined timestamps. In that case, we need to re-mark the + // user_defined_timestamps_persisted flag for the file. + file_to_ingest->user_defined_timestamps_persisted = false; + } else if (!s.ok()) { + return s; + } + + // `TableReader` is initialized with `user_defined_timestamps_persisted` flag + // to be true. If its value changed to false after this sanity check, we + // need to reset the `TableReader`. + auto ucmp = cfd_->user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0 && + !file_to_ingest->user_defined_timestamps_persisted) { + s = ResetTableReader(external_file, new_file_number, + file_to_ingest->user_defined_timestamps_persisted, sv, + file_to_ingest, table_reader); + } + return s; +} + +Status ExternalSstFileIngestionJob::GetIngestedFileInfo( + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_ingest, SuperVersion* sv) { + file_to_ingest->external_file_path = external_file; + + // Get external file size + Status status = fs_->GetFileSize(external_file, IOOptions(), + &file_to_ingest->file_size, nullptr); + if (!status.ok()) { + return status; + } + + // Assign FD with number + file_to_ingest->fd = + FileDescriptor(new_file_number, 0, file_to_ingest->file_size); + + // Create TableReader for external file + std::unique_ptr table_reader; + // Initially create the `TableReader` with flag + // `user_defined_timestamps_persisted` to be true since that's the most common + // case + status = ResetTableReader(external_file, new_file_number, + /*user_defined_timestamps_persisted=*/true, sv, + file_to_ingest, &table_reader); + if (!status.ok()) { + return status; + } + + status = SanityCheckTableProperties(external_file, new_file_number, sv, + file_to_ingest, &table_reader); + if (!status.ok()) { + return status; + } + + if (ingestion_options_.verify_checksums_before_ingest) { + // If customized readahead size is needed, we can pass a user option + // all the way to here. Right now we just rely on the default readahead + // to keep things simple. + // TODO: plumb Env::IOActivity, Env::IOPriority + ReadOptions ro; + ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; + status = table_reader->VerifyChecksum( + ro, TableReaderCaller::kExternalSSTIngestion); + if (!status.ok()) { + return status; + } + } + ParsedInternalKey key; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, @@ -837,7 +904,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( table_reader->NewRangeTombstoneIterator(ro)); // We may need to adjust these key bounds, depending on whether any range // deletion tombstones extend past them. - const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); + const Comparator* ucmp = cfd_->user_comparator(); if (range_del_iter != nullptr) { for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); range_del_iter->Next()) { @@ -865,13 +932,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } } - file_to_ingest->cf_id = static_cast(props->column_family_id); - - file_to_ingest->table_properties = *props; - - auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, - props->orig_file_number, - &(file_to_ingest->unique_id)); + auto s = + GetSstInternalUniqueId(file_to_ingest->table_properties.db_id, + file_to_ingest->table_properties.db_session_id, + file_to_ingest->table_properties.orig_file_number, + &(file_to_ingest->unique_id)); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Failed to get SST unique id for file %s", @@ -888,23 +953,26 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( SequenceNumber* assigned_seqno) { Status status; *assigned_seqno = 0; - if (force_global_seqno) { + auto ucmp = cfd_->user_comparator(); + const size_t ts_sz = ucmp->timestamp_size(); + if (force_global_seqno || files_overlap_) { *assigned_seqno = last_seqno + 1; - if (compaction_style == kCompactionStyleUniversal || files_overlap_) { + // If files overlap, we have to ingest them at level 0. + if (files_overlap_) { + assert(ts_sz == 0); + file_to_ingest->picked_level = 0; if (ingestion_options_.fail_if_not_bottommost_level) { status = Status::TryAgain( "Files cannot be ingested to Lmax. Please make sure key range of " "Lmax does not overlap with files to ingest."); - return status; } - file_to_ingest->picked_level = 0; return status; } } bool overlap_with_db = false; Arena arena; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; int target_level = 0; @@ -939,26 +1007,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( overlap_with_db = true; break; } - - if (compaction_style == kCompactionStyleUniversal && lvl != 0) { - const std::vector& level_files = - vstorage->LevelFiles(lvl); - const SequenceNumber level_largest_seqno = - (*std::max_element(level_files.begin(), level_files.end(), - [](FileMetaData* f1, FileMetaData* f2) { - return f1->fd.largest_seqno < - f2->fd.largest_seqno; - })) - ->fd.largest_seqno; - // should only assign seqno to current level's largest seqno when - // the file fits - if (level_largest_seqno != 0 && - IngestedFileFitInLevel(file_to_ingest, lvl)) { - *assigned_seqno = level_largest_seqno; - } else { - continue; - } - } } else if (compaction_style == kCompactionStyleUniversal) { continue; } @@ -969,12 +1017,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( target_level = lvl; } } - // If files overlap, we have to ingest them at level 0 and assign the newest - // sequence number - if (files_overlap_) { - target_level = 0; - *assigned_seqno = last_seqno + 1; - } if (ingestion_options_.fail_if_not_bottommost_level && target_level < cfd_->NumberLevels() - 1) { @@ -989,8 +1031,16 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile", &overlap_with_db); file_to_ingest->picked_level = target_level; - if (overlap_with_db && *assigned_seqno == 0) { - *assigned_seqno = last_seqno + 1; + if (overlap_with_db) { + if (ts_sz > 0) { + status = Status::InvalidArgument( + "Column family enables user-defined timestamps, please make sure the " + "key range (without timestamp) of external file does not overlap " + "with key range (without timestamp) in the db"); + } + if (*assigned_seqno == 0) { + *assigned_seqno = last_seqno + 1; + } } return status; } @@ -998,12 +1048,12 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( IngestedFileInfo* file_to_ingest) { auto* vstorage = cfd_->current()->storage_info(); - // First, check if new files fit in the bottommost level - int bottom_lvl = cfd_->NumberLevels() - 1; - if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) { + // First, check if new files fit in the last level + int last_lvl = cfd_->NumberLevels() - 1; + if (!IngestedFileFitInLevel(file_to_ingest, last_lvl)) { return Status::InvalidArgument( "Can't ingest_behind file as it doesn't fit " - "at the bottommost level!"); + "at the last level!"); } // Second, check if despite allow_ingest_behind=true we still have 0 seqnums @@ -1018,7 +1068,7 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( } } - file_to_ingest->picked_level = bottom_lvl; + file_to_ingest->picked_level = last_lvl; return Status::OK(); } @@ -1029,7 +1079,8 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( return Status::OK(); } else if (!ingestion_options_.allow_global_seqno) { return Status::InvalidArgument("Global seqno is required, but disabled"); - } else if (file_to_ingest->global_seqno_offset == 0) { + } else if (ingestion_options_.write_global_seqno && + file_to_ingest->global_seqno_offset == 0) { return Status::InvalidArgument( "Trying to set global seqno for a file that don't have a global seqno " "field"); @@ -1101,8 +1152,8 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( if (!io_s.ok()) { return io_s; } - file_to_ingest->file_checksum = file_checksum; - file_to_ingest->file_checksum_func_name = file_checksum_func_name; + file_to_ingest->file_checksum = std::move(file_checksum); + file_to_ingest->file_checksum_func_name = std::move(file_checksum_func_name); return IOStatus::OK(); } diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index 49bb1e31e59..16b7fefbe9b 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -43,7 +43,7 @@ struct IngestedFileInfo { uint64_t num_entries; // total number of range deletions in external file uint64_t num_range_deletions; - // Id of column family this file shoule be ingested into + // Id of column family this file should be ingested into uint32_t cf_id; // TableProperties read from external file TableProperties table_properties; @@ -73,6 +73,14 @@ struct IngestedFileInfo { Temperature file_temperature = Temperature::kUnknown; // Unique id of the file to be ingested UniqueId64x2 unique_id{}; + // Whether the external file should be treated as if it has user-defined + // timestamps or not. If this flag is false, and the column family enables + // UDT feature, the file will have min-timestamp artificially padded to its + // user keys when it's read. Since it will affect how `TableReader` reads a + // table file, it's defaulted to optimize for the majority of the case where + // the user key's format in the external file matches the column family's + // setting. + bool user_defined_timestamps_persisted = true; }; class ExternalSstFileIngestionJob { @@ -102,16 +110,7 @@ class ExternalSstFileIngestionJob { assert(directories != nullptr); } - ~ExternalSstFileIngestionJob() { - for (const auto& c : file_ingesting_compactions_) { - cfd_->compaction_picker()->UnregisterCompaction(c); - delete c; - } - - for (const auto& f : compaction_input_metdatas_) { - delete f; - } - } + ~ExternalSstFileIngestionJob() { UnregisterRange(); } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, @@ -156,10 +155,27 @@ class ExternalSstFileIngestionJob { return files_to_ingest_; } - // How many sequence numbers did we consume as part of the ingest job? + // How many sequence numbers did we consume as part of the ingestion job? int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; } private: + Status ResetTableReader(const std::string& external_file, + uint64_t new_file_number, + bool user_defined_timestamps_persisted, + SuperVersion* sv, IngestedFileInfo* file_to_ingest, + std::unique_ptr* table_reader); + + // Read the external file's table properties to do various sanity checks and + // populates certain fields in `IngestedFileInfo` according to some table + // properties. + // In some cases when sanity check passes, `table_reader` could be reset with + // different options. For example: when external file does not contain + // timestamps while column family enables UDT in Memtables only feature. + Status SanityCheckTableProperties(const std::string& external_file, + uint64_t new_file_number, SuperVersion* sv, + IngestedFileInfo* file_to_ingest, + std::unique_ptr* table_reader); + // Open the external file and populate `file_to_ingest` with all the // external information we need to ingest this file. Status GetIngestedFileInfo(const std::string& external_file, @@ -203,6 +219,9 @@ class ExternalSstFileIngestionJob { // compactions. void CreateEquivalentFileIngestingCompactions(); + // Remove all the internal files created, called when ingestion job fails. + void DeleteInternalFiles(); + SystemClock* clock_; FileSystemPtr fs_; VersionSet* versions_; diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index ef4ab7fa58a..64f8a03d46d 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -92,7 +92,7 @@ class ExternalSSTFileTest : public ExternalSSTFileTestBase, public ::testing::WithParamInterface> { public: - ExternalSSTFileTest() {} + ExternalSSTFileTest() = default; Status GenerateOneExternalFile( const Options& options, ColumnFamilyHandle* cfh, @@ -295,6 +295,25 @@ class ExternalSSTFileTest int last_file_id_ = 0; }; +TEST_F(ExternalSSTFileTest, ComparatorMismatch) { + Options options = CurrentOptions(); + Options options_diff_ucmp = options; + + options.comparator = BytewiseComparator(); + options_diff_ucmp.comparator = ReverseBytewiseComparator(); + + SstFileWriter sst_file_writer(EnvOptions(), options_diff_ucmp); + + std::string file = sst_files_dir_ + "file.sst"; + ASSERT_OK(sst_file_writer.Open(file)); + ASSERT_OK(sst_file_writer.Put("foo", "val")); + ASSERT_OK(sst_file_writer.Put("bar", "val1")); + ASSERT_OK(sst_file_writer.Finish()); + + DestroyAndReopen(options); + ASSERT_NOK(DeprecatedAddFile({file})); +} + TEST_F(ExternalSSTFileTest, Basic) { do { Options options = CurrentOptions(); @@ -832,7 +851,7 @@ TEST_F(ExternalSSTFileTest, AddList) { TablePropertiesCollection props; ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); ASSERT_EQ(props.size(), 2); - for (auto file_props : props) { + for (const auto& file_props : props) { auto user_props = file_props.second->user_collected_properties; ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES"); ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES"); @@ -855,7 +874,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); ASSERT_EQ(props.size(), 3); - for (auto file_props : props) { + for (const auto& file_props : props) { auto user_props = file_props.second->user_collected_properties; ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES"); ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES"); @@ -1714,9 +1733,8 @@ TEST_F(ExternalSSTFileTest, WithUnorderedWrite) { {"DBImpl::WaitForPendingWrites:BeforeBlock", "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}}); SyncPoint::GetInstance()->SetCallBack( - "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) { - ASSERT_TRUE(*reinterpret_cast(need_flush)); - }); + "DBImpl::IngestExternalFile:NeedFlush", + [&](void* need_flush) { ASSERT_TRUE(*static_cast(need_flush)); }); Options options = CurrentOptions(); options.unordered_write = true; @@ -1848,6 +1866,92 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { VerifyDBFromMap(true_data, &kcnt, false); } +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedUniversal) { + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + Options options = CurrentOptions(); + options.num_levels = 5; + options.compaction_style = kCompactionStyleUniversal; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + std::vector> file_data; + std::map true_data; + + // Write 200 -> 250 into the bottommost level + for (int i = 200; i <= 250; i++) { + ASSERT_OK(Put(Key(i), "bottommost")); + true_data[Key(i)] = "bottommost"; + } + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel()); + + // Take a snapshot to enforce global sequence number. + const Snapshot* snap = db_->GetSnapshot(); + + // Insert 100 -> 200 into the memtable + for (int i = 100; i <= 200; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + + // Insert 0 -> 20 using AddFile + file_data.clear(); + for (int i = 0; i <= 20; i++) { + file_data.emplace_back(Key(i), "L4"); + } + + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file don't overlap with anything in the DB, will go to L4 + ASSERT_EQ("0,0,0,0,2", FilesPerLevel()); + + // Insert 80 -> 130 using AddFile + file_data.clear(); + for (int i = 80; i <= 130; i++) { + file_data.emplace_back(Key(i), "L0"); + } + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file overlap with the memtable, so it will flush it and add + // it self to L0 + ASSERT_EQ("2,0,0,0,2", FilesPerLevel()); + + // Insert 30 -> 50 using AddFile + file_data.clear(); + for (int i = 30; i <= 50; i++) { + file_data.emplace_back(Key(i), "L4"); + } + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file don't overlap with anything in the DB and fit in L4 as well + ASSERT_EQ("2,0,0,0,3", FilesPerLevel()); + + // Insert 10 -> 40 using AddFile + file_data.clear(); + for (int i = 10; i <= 40; i++) { + file_data.emplace_back(Key(i), "L3"); + } + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file overlap with files in L4, we will ingest it into the last + // non-overlapping and non-empty level, in this case, it's L0. + ASSERT_EQ("3,0,0,0,3", FilesPerLevel()); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + db_->ReleaseSnapshot(snap); +} + TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { Options options = CurrentOptions(); DestroyAndReopen(options); @@ -2878,7 +2982,7 @@ TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) { // currently at the front of the 2nd writer queue. We must make // sure that it won't enter the 2nd writer queue for the second time. std::vector> data; - data.push_back(std::make_pair("1001", "v2")); + data.emplace_back("1001", "v2"); ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true)); } @@ -2978,6 +3082,531 @@ TEST_P(ExternalSSTFileTest, delete iter; } +class ExternalSSTFileWithTimestampTest : public ExternalSSTFileTest { + public: + ExternalSSTFileWithTimestampTest() = default; + + static const std::string kValueNotFound; + static const std::string kTsNotFound; + + std::string EncodeAsUint64(uint64_t v) { + std::string dst; + PutFixed64(&dst, v); + return dst; + } + + Status IngestExternalUDTFile(const std::vector& files, + bool allow_global_seqno = true) { + IngestExternalFileOptions opts; + opts.snapshot_consistency = true; + opts.allow_global_seqno = allow_global_seqno; + return db_->IngestExternalFile(files, opts); + } + + void VerifyValueAndTs(const std::string& key, + const std::string& read_timestamp, + const std::string& expected_value, + const std::string& expected_timestamp) { + Slice read_ts = read_timestamp; + ReadOptions read_options; + read_options.timestamp = &read_ts; + std::string value; + std::string timestamp; + Status s = db_->Get(read_options, key, &value, ×tamp); + if (s.ok()) { + ASSERT_EQ(value, expected_value); + ASSERT_EQ(timestamp, expected_timestamp); + } else if (s.IsNotFound()) { + ASSERT_EQ(kValueNotFound, expected_value); + ASSERT_EQ(kTsNotFound, expected_timestamp); + } else { + assert(false); + } + } +}; + +const std::string ExternalSSTFileWithTimestampTest::kValueNotFound = + "NOT_FOUND"; +const std::string ExternalSSTFileWithTimestampTest::kTsNotFound = + "NOT_FOUND_TS"; + +TEST_F(ExternalSSTFileWithTimestampTest, Basic) { + do { + Options options = CurrentOptions(); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = true; + + DestroyAndReopen(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open + // a file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + // file1.sst [0, 50) + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 50; k++) { + // write 3 versions of values for each key, write newer version first + // they are treated as logically smaller by the comparator. + for (int version = 3; version > 0; version--) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(k + version), + Key(k) + "_val" + std::to_string(version))); + } + } + + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + // sst_file_writer already finished, cannot add this value + ASSERT_NOK(sst_file_writer.Put(Key(100), EncodeAsUint64(1), "bad_val")); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 150); + ASSERT_EQ(file1_info.smallest_key, Key(0) + EncodeAsUint64(0 + 3)); + ASSERT_EQ(file1_info.largest_key, Key(49) + EncodeAsUint64(49 + 1)); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + // Add file using file path + ASSERT_OK(IngestExternalUDTFile({file1})); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + + for (int k = 0; k < 50; k++) { + for (int version = 3; version > 0; version--) { + VerifyValueAndTs(Key(k), EncodeAsUint64(k + version), + Key(k) + "_val" + std::to_string(version), + EncodeAsUint64(k + version)); + } + } + + // file2.sst [50, 200) + // Put [key=k, ts=k, value=k_val] for k in [50, 200) + // RangeDelete[start_key=75, end_key=125, ts=100] + std::string file2 = sst_files_dir_ + "file2.sst"; + int range_del_begin = 75, range_del_end = 125, range_del_ts = 100; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 50; k < 200; k++) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(k), Key(k) + "_val")); + if (k == range_del_ts) { + ASSERT_OK(sst_file_writer.DeleteRange( + Key(range_del_begin), Key(range_del_end), EncodeAsUint64(k))); + } + } + + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 150); + ASSERT_EQ(file2_info.smallest_key, Key(50) + EncodeAsUint64(50)); + ASSERT_EQ(file2_info.largest_key, Key(199) + EncodeAsUint64(199)); + ASSERT_EQ(file2_info.num_range_del_entries, 1); + ASSERT_EQ(file2_info.smallest_range_del_key, + Key(range_del_begin) + EncodeAsUint64(range_del_ts)); + ASSERT_EQ(file2_info.largest_range_del_key, + Key(range_del_end) + EncodeAsUint64(range_del_ts)); + // Add file using file path + ASSERT_OK(IngestExternalUDTFile({file2})); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + + for (int k = 50; k < 200; k++) { + if (k < range_del_begin || k >= range_del_end) { + VerifyValueAndTs(Key(k), EncodeAsUint64(k), Key(k) + "_val", + EncodeAsUint64(k)); + } + // else { + // // FIXME(yuzhangyu): when range tombstone and point data has the + // // same seq, on read path, make range tombstone overrides point + // // data if it has a newer user-defined timestamp. This is how + // // we determine point data's overriding relationship, so we + // // should keep it consistent. + // VerifyValueAndTs(Key(k), EncodeAsUint64(k), Key(k) + "_val", + // EncodeAsUint64(k)); + // VerifyValueAndTs(Key(k), EncodeAsUint64(range_del_ts), + // kValueNotFound, + // kTsNotFound); + // } + } + + // file3.sst [100, 200), key range overlap with db + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 100; k < 200; k++) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(k + 1), Key(k) + "_val1")); + } + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 100); + ASSERT_EQ(file3_info.smallest_key, Key(100) + EncodeAsUint64(101)); + ASSERT_EQ(file3_info.largest_key, Key(199) + EncodeAsUint64(200)); + + // Allowing ingesting a file containing overlap key range with the db is + // not safe without verifying the overlapped key has a higher timestamp + // than what the db contains, so we do not allow this regardless of + // whether global sequence number is allowed. + ASSERT_NOK(IngestExternalUDTFile({file2})); + ASSERT_NOK(IngestExternalUDTFile({file2}, /*allow_global_seqno*/ false)); + + // Write [0, 50) + // Write to DB newer versions to cover ingested data and move sequence + // number forward. + for (int k = 0; k < 50; k++) { + ASSERT_OK(dbfull()->Put(WriteOptions(), Key(k), EncodeAsUint64(k + 4), + Key(k) + "_val" + std::to_string(4))); + } + + // Read all 4 versions (3 from ingested, 1 from live writes). + for (int k = 0; k < 50; k++) { + for (int version = 4; version > 0; version--) { + VerifyValueAndTs(Key(k), EncodeAsUint64(k + version), + Key(k) + "_val" + std::to_string(version), + EncodeAsUint64(k + version)); + } + } + SequenceNumber seq_num_before_ingestion = db_->GetLatestSequenceNumber(); + ASSERT_GT(seq_num_before_ingestion, 0U); + + // file4.sst [200, 250) + std::string file4 = sst_files_dir_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 200; k < 250; k++) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(k), Key(k) + "_val")); + } + + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 50); + ASSERT_EQ(file4_info.smallest_key, Key(200) + EncodeAsUint64(200)); + ASSERT_EQ(file4_info.largest_key, Key(249) + EncodeAsUint64(249)); + ASSERT_EQ(file4_info.num_range_del_entries, 0); + ASSERT_EQ(file4_info.smallest_range_del_key, ""); + ASSERT_EQ(file4_info.largest_range_del_key, ""); + + ASSERT_OK(IngestExternalUDTFile({file4})); + + // In UDT mode, any external file that can be successfully ingested also + // should not overlap with the db. As a result, they can always get the + // seq 0 assigned. + ASSERT_EQ(db_->GetLatestSequenceNumber(), seq_num_before_ingestion); + + DestroyAndRecreateExternalSSTFilesDir(); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction | + kRangeDelSkipConfigs)); +} + +TEST_F(ExternalSSTFileWithTimestampTest, SanityCheck) { + Options options = CurrentOptions(); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = true; + DestroyAndReopen(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file1.sst [0, 100) + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), EncodeAsUint64(k), Key(k) + "_val")); + } + + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + + // file2.sst [50, 75) + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 50; k < 75; k++) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(k + 2), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + + // Cannot ingest when files' user key range overlaps. There is no + // straightforward way to assign sequence number to the files so that they + // meet the user-defined timestamps invariant: for the same user provided key, + // the entry with a higher sequence number should not have a smaller + // timestamp. In this case: file1 has (key=k, ts=k) for k in [50, 75), + // file2 has (key=k, ts=k+2) for k in [50, 75). + // The invariant is only met if file2 is ingested after file1. In other cases + // when user key ranges are interleaved in files, no order of ingestion can + // guarantee this invariant. So we do not allow ingesting files with + // overlapping key ranges. + ASSERT_TRUE(IngestExternalUDTFile({file1, file2}).IsNotSupported()); + + options.allow_ingest_behind = true; + DestroyAndReopen(options); + IngestExternalFileOptions opts; + + // TODO(yuzhangyu): support ingestion behind for user-defined timestamps? + // Ingesting external files with user-defined timestamps requires searching + // through the whole lsm tree to make sure there is no key range overlap with + // the db. Ingestion behind currently is doing a simply placing it at the + // bottom level step without a search, so we don't allow it either. + opts.ingest_behind = true; + ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported()); + + DestroyAndRecreateExternalSSTFilesDir(); +} + +TEST_F(ExternalSSTFileWithTimestampTest, UDTSettingsCompatibilityCheck) { + Options options = CurrentOptions(); + Options disable_udt_options = options; + Options not_persist_udt_options = options; + Options persist_udt_options = options; + disable_udt_options.comparator = BytewiseComparator(); + not_persist_udt_options.comparator = + test::BytewiseComparatorWithU64TsWrapper(); + not_persist_udt_options.persist_user_defined_timestamps = false; + not_persist_udt_options.allow_concurrent_memtable_write = false; + persist_udt_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + persist_udt_options.persist_user_defined_timestamps = true; + + EnvOptions env_options = EnvOptions(); + + SstFileWriter disable_udt_sst_writer(env_options, disable_udt_options); + SstFileWriter not_persist_udt_sst_writer(env_options, + not_persist_udt_options); + SstFileWriter persist_udt_sst_writer(env_options, persist_udt_options); + + // File1: [0, 50), contains no timestamps + // comparator name: leveldb.BytewiseComparator + // user_defined_timestamps_persisted: true + std::string disable_udt_sst_file = sst_files_dir_ + "file1.sst"; + ASSERT_OK(disable_udt_sst_writer.Open(disable_udt_sst_file)); + for (int k = 0; k < 50; k++) { + ASSERT_NOK( + disable_udt_sst_writer.Put(Key(k), EncodeAsUint64(1), Key(k) + "_val")); + ASSERT_OK(disable_udt_sst_writer.Put(Key(k), Key(k) + "_val")); + } + ASSERT_OK(disable_udt_sst_writer.Finish()); + + // File2: [50, 100), contains no timestamps + // comparator name: leveldb.BytewiseComparator.u64ts + // user_defined_timestamps_persisted: false + std::string not_persist_udt_sst_file = sst_files_dir_ + "file2.sst"; + ASSERT_OK(not_persist_udt_sst_writer.Open(not_persist_udt_sst_file)); + for (int k = 50; k < 100; k++) { + ASSERT_NOK(not_persist_udt_sst_writer.Put(Key(k), Key(k) + "_val")); + ASSERT_NOK(not_persist_udt_sst_writer.Put(Key(k), EncodeAsUint64(k), + Key(k) + "_val")); + ASSERT_OK(not_persist_udt_sst_writer.Put(Key(k), EncodeAsUint64(0), + Key(k) + "_val")); + } + ASSERT_OK(not_persist_udt_sst_writer.Finish()); + + // File3: [100, 150), contains timestamp + // comparator name: leveldb.BytewiseComparator.u64ts + // user_defined_timestamps_persisted: true + std::string persist_udt_sst_file = sst_files_dir_ + "file3.sst"; + ASSERT_OK(persist_udt_sst_writer.Open(persist_udt_sst_file)); + for (int k = 100; k < 150; k++) { + ASSERT_NOK(persist_udt_sst_writer.Put(Key(k), Key(k) + "_val")); + ASSERT_OK( + persist_udt_sst_writer.Put(Key(k), EncodeAsUint64(k), Key(k) + "_val")); + } + ASSERT_OK(persist_udt_sst_writer.Finish()); + + DestroyAndReopen(disable_udt_options); + ASSERT_OK( + IngestExternalUDTFile({disable_udt_sst_file, not_persist_udt_sst_file})); + ASSERT_NOK(IngestExternalUDTFile({persist_udt_sst_file})); + for (int k = 0; k < 100; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + DestroyAndReopen(not_persist_udt_options); + ASSERT_OK( + IngestExternalUDTFile({disable_udt_sst_file, not_persist_udt_sst_file})); + ASSERT_NOK(IngestExternalUDTFile({persist_udt_sst_file})); + for (int k = 0; k < 100; k++) { + VerifyValueAndTs(Key(k), EncodeAsUint64(0), Key(k) + "_val", + EncodeAsUint64(0)); + } + + DestroyAndReopen(persist_udt_options); + ASSERT_NOK( + IngestExternalUDTFile({disable_udt_sst_file, not_persist_udt_sst_file})); + ASSERT_OK(IngestExternalUDTFile({persist_udt_sst_file})); + for (int k = 100; k < 150; k++) { + VerifyValueAndTs(Key(k), EncodeAsUint64(k), Key(k) + "_val", + EncodeAsUint64(k)); + } + + DestroyAndRecreateExternalSSTFilesDir(); +} + +TEST_F(ExternalSSTFileWithTimestampTest, TimestampsNotPersistedBasic) { + do { + Options options = CurrentOptions(); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + + DestroyAndReopen(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file1.sst [0, 50) + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 50; k++) { + // Attempting to write 2 versions of values for each key, only the version + // with timestamp 0 goes through. + for (int version = 1; version >= 0; version--) { + if (version == 1) { + ASSERT_NOK( + sst_file_writer.Put(Key(k), EncodeAsUint64(version), + Key(k) + "_val" + std::to_string(version))); + } else { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(version), + Key(k) + "_val" + std::to_string(version))); + } + } + } + + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + // sst_file_writer already finished, cannot add this value + ASSERT_NOK(sst_file_writer.Put(Key(100), EncodeAsUint64(0), "bad_val")); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 50); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(49)); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + // Add file using file path + ASSERT_OK(IngestExternalUDTFile({file1})); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + + // Read ingested file: all data contain minimum timestamps. + for (int k = 0; k < 50; k++) { + VerifyValueAndTs(Key(k), EncodeAsUint64(0), + Key(k) + "_val" + std::to_string(0), EncodeAsUint64(0)); + } + + // file2.sst [50, 200) + // Put [key=k, ts=0, value=k_val0] for k in [50, 200) + // RangeDelete[start_key=75, end_key=125, ts=0] + std::string file2 = sst_files_dir_ + "file2.sst"; + int range_del_begin = 75, range_del_end = 125; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 50; k < 200; k++) { + // All these timestamps will later be effectively 0 + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(0), Key(k) + "_val0")); + } + ASSERT_OK(sst_file_writer.DeleteRange( + Key(range_del_begin), Key(range_del_end), EncodeAsUint64(0))); + + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 150); + ASSERT_EQ(file2_info.smallest_key, Key(50)); + ASSERT_EQ(file2_info.largest_key, Key(199)); + ASSERT_EQ(file2_info.num_range_del_entries, 1); + ASSERT_EQ(file2_info.smallest_range_del_key, Key(range_del_begin)); + ASSERT_EQ(file2_info.largest_range_del_key, Key(range_del_end)); + // Add file using file path + ASSERT_OK(IngestExternalUDTFile({file2})); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + + // Range deletion covering point data in the same file is over-written. + for (int k = 50; k < 200; k++) { + VerifyValueAndTs(Key(k), EncodeAsUint64(0), Key(k) + "_val0", + EncodeAsUint64(0)); + } + + // file3.sst [100, 200), key range overlap with db + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 100; k < 200; k++) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(0), Key(k) + "_val0")); + } + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 100); + ASSERT_EQ(file3_info.smallest_key, Key(100)); + ASSERT_EQ(file3_info.largest_key, Key(199)); + + // In UDT mode, file with overlapping key range cannot be ingested. + ASSERT_NOK(IngestExternalUDTFile({file3})); + ASSERT_NOK(IngestExternalUDTFile({file3}, /*allow_global_seqno*/ false)); + + // Write [0, 50) + // Write to DB newer versions to cover ingested data and move sequence + // number forward. + for (int k = 0; k < 50; k++) { + for (int version = 1; version < 3; version++) { + ASSERT_OK(dbfull()->Put(WriteOptions(), Key(k), EncodeAsUint64(version), + Key(k) + "_val" + std::to_string(version))); + } + } + + // Read three versions (1 from ingested, 2 from live writes) + for (int k = 0; k < 50; k++) { + for (int version = 0; version < 3; version++) { + VerifyValueAndTs(Key(k), EncodeAsUint64(version), + Key(k) + "_val" + std::to_string(version), + EncodeAsUint64(version)); + } + } + SequenceNumber seq_num_before_ingestion = db_->GetLatestSequenceNumber(); + ASSERT_GT(seq_num_before_ingestion, 0U); + + // file4.sst [200, 250) + std::string file4 = sst_files_dir_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 200; k < 250; k++) { + ASSERT_OK( + sst_file_writer.Put(Key(k), EncodeAsUint64(0), Key(k) + "_val")); + } + + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 50); + ASSERT_EQ(file4_info.smallest_key, Key(200)); + ASSERT_EQ(file4_info.largest_key, Key(249)); + ASSERT_EQ(file4_info.num_range_del_entries, 0); + ASSERT_EQ(file4_info.smallest_range_del_key, ""); + ASSERT_EQ(file4_info.largest_range_del_key, ""); + + ASSERT_OK(IngestExternalUDTFile({file4})); + + // Ingested files do not overlap with db, they can always have global seqno + // 0 assigned. + ASSERT_EQ(db_->GetLatestSequenceNumber(), seq_num_before_ingestion); + + DestroyAndRecreateExternalSSTFilesDir(); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction | + kRangeDelSkipConfigs)); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, testing::Values(std::make_tuple(false, false), std::make_tuple(false, true), diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index d888dfde104..17b4c034283 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -572,7 +572,7 @@ TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) { edit.SetColumnFamily(0); std::string buf; assert(edit.EncodeTo(&buf)); - const Status s = log_writer->AddRecord(buf); + const Status s = log_writer->AddRecord(WriteOptions(), buf); ASSERT_NOK(s); } diff --git a/db/flush_job.cc b/db/flush_job.cc index 0b60a4bbd0d..5dda03844ae 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -100,9 +100,9 @@ FlushJob::FlushJob( Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const SeqnoToTimeMapping& seqno_to_time_mapping, const std::string& db_id, - const std::string& db_session_id, std::string full_history_ts_low, - BlobFileCompletionCallback* blob_callback) + std::shared_ptr seqno_to_time_mapping, + const std::string& db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) : dbname_(dbname), db_id_(db_id), db_session_id_(db_session_id), @@ -136,7 +136,7 @@ FlushJob::FlushJob( clock_(db_options_.clock), full_history_ts_low_(std::move(full_history_ts_low)), blob_callback_(blob_callback), - db_impl_seqno_to_time_mapping_(seqno_to_time_mapping) { + seqno_to_time_mapping_(std::move(seqno_to_time_mapping)) { // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -417,7 +417,7 @@ Status FlushJob::MemPurge() { // Create two iterators, one for the memtable data (contains // info from puts + deletes), and one for the memtable // Range Tombstones (from DeleteRanges). - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -425,7 +425,8 @@ Status FlushJob::MemPurge() { std::vector> range_del_iters; for (MemTable* m : mems_) { - memtables.push_back(m->NewIterator(ro, &arena)); + memtables.push_back( + m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { @@ -447,7 +448,7 @@ Status FlushJob::MemPurge() { : earliest_seqno; } - ScopedArenaIterator iter( + ScopedArenaPtr iter( NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(), static_cast(memtables.size()), &arena)); @@ -709,8 +710,8 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Cochran formula for determining sample size. // 95% confidence interval, 7% precision. // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 - // TODO: plumb Env::IOActivity double n0 = 196.0; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; @@ -858,19 +859,14 @@ Status FlushJob::WriteLevel0Table() { const uint64_t start_cpu_micros = clock_->CPUMicros(); Status s; - SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber(); - if (!db_impl_seqno_to_time_mapping_.Empty()) { - // make a local copy, as the seqno_to_time_mapping from db_impl is not - // thread safe, which will be used while not holding the db_mutex. - seqno_to_time_mapping_ = - db_impl_seqno_to_time_mapping_.Copy(smallest_seqno); - } + meta_.temperature = mutable_cf_options_.default_write_temperature; + file_options_.temperature = meta_.temperature; std::vector blob_file_additions; { auto write_hint = cfd_->CalculateSSTWriteHint(0); - Env::IOPriority io_priority = GetRateLimiterPriorityForWrite(); + Env::IOPriority io_priority = GetRateLimiterPriority(); db_mutex_->Unlock(); if (log_buffer_) { log_buffer_->FlushBufferToLog(); @@ -903,7 +899,8 @@ Status FlushJob::WriteLevel0Table() { ", replication sequence (hex) %s\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber(), Slice(m->GetReplicationSequence()).ToString(true).c_str()); - memtables.push_back(m->NewIterator(ro, &arena)); + memtables.push_back( + m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { @@ -930,7 +927,7 @@ Status FlushJob::WriteLevel0Table() { << GetFlushReasonString(flush_reason_); { - ScopedArenaIterator iter( + ScopedArenaPtr iter( NewMergingIterator(&cfd_->internal_comparator(), memtables.data(), static_cast(memtables.size()), &arena)); ROCKS_LOG_INFO(db_options_.info_log, @@ -971,29 +968,31 @@ Status FlushJob::WriteLevel0Table() { const std::string* const full_history_ts_low = (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_; + ReadOptions read_options(Env::IOActivity::kFlush); + read_options.rate_limiter_priority = io_priority; + const WriteOptions write_options(io_priority, Env::IOActivity::kFlush); TableBuilderOptions tboptions( - *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), output_compression_, - mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), - 0 /* level */, false /* is_bottommost */, - TableFileCreationReason::kFlush, oldest_key_time, current_time, - db_id_, db_session_id_, 0 /* target_file_size */, - meta_.fd.GetNumber()); + *cfd_->ioptions(), mutable_cf_options_, read_options, write_options, + cfd_->internal_comparator(), cfd_->internal_tbl_prop_coll_factories(), + output_compression_, mutable_cf_options_.compression_opts, + cfd_->GetID(), cfd_->GetName(), 0 /* level */, + false /* is_bottommost */, TableFileCreationReason::kFlush, + oldest_key_time, current_time, db_id_, db_session_id_, + 0 /* target_file_size */, meta_.fd.GetNumber()); const SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); - const ReadOptions read_options(Env::IOActivity::kFlush); - s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_, - read_options, cfd_->table_cache(), iter.get(), - std::move(range_del_iters), &meta_, &blob_file_additions, - existing_snapshots_, earliest_write_conflict_snapshot_, - job_snapshot_seq, snapshot_checker_, - mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kFlush, seqno_to_time_mapping_, - event_logger_, job_context_->job_id, io_priority, - &table_properties_, write_hint, full_history_ts_low, - blob_callback_, base_, &num_input_entries, - &memtable_payload_bytes, &memtable_garbage_bytes); + + s = BuildTable( + dbname_, versions_, db_options_, tboptions, file_options_, + cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, + &blob_file_additions, existing_snapshots_, + earliest_write_conflict_snapshot_, job_snapshot_seq, + snapshot_checker_, mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_.get(), + event_logger_, job_context_->job_id, &table_properties_, write_hint, + full_history_ts_low, blob_callback_, base_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); @@ -1097,7 +1096,7 @@ Status FlushJob::WriteLevel0Table() { return s; } -Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() { +Env::IOPriority FlushJob::GetRateLimiterPriority() { if (versions_ && versions_->GetColumnFamilySet() && versions_->GetColumnFamilySet()->write_controller()) { WriteController* write_controller = @@ -1188,8 +1187,9 @@ Status FlushJob::MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT() { edit.SetColumnFamily(cfd_->GetID()); edit.SetFullHistoryTsLow(new_full_history_ts_low); return versions_->LogAndApply(cfd_, *cfd_->GetLatestMutableCFOptions(), - ReadOptions(), &edit, db_mutex_, - output_file_directory_); + ReadOptions(Env::IOActivity::kFlush), + WriteOptions(Env::IOActivity::kFlush), &edit, + db_mutex_, output_file_directory_); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/flush_job.h b/db/flush_job.h index aef33ef423a..337e9cd9bc1 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -39,7 +39,6 @@ #include "rocksdb/listener.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" -#include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -73,7 +72,7 @@ class FlushJob { EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const SeqnoToTimeMapping& seq_time_mapping, + std::shared_ptr seqno_to_time_mapping, const std::string& db_id = "", const std::string& db_session_id = "", std::string full_history_ts_low = "", BlobFileCompletionCallback* blob_callback = nullptr); @@ -129,7 +128,7 @@ class FlushJob { Status MemPurge(); bool MemPurgeDecider(double threshold); // The rate limiter priority (io_priority) is determined dynamically here. - Env::IOPriority GetRateLimiterPriorityForWrite(); + Env::IOPriority GetRateLimiterPriority(); std::unique_ptr GetFlushJobInfo() const; // Require db_mutex held. @@ -157,7 +156,7 @@ class FlushJob { // this job. All memtables in this column family with an ID smaller than or // equal to max_memtable_id_ will be selected for flush. uint64_t max_memtable_id_; - const FileOptions file_options_; + FileOptions file_options_; VersionSet* versions_; InstrumentedMutex* db_mutex_; std::atomic* shutting_down_; @@ -210,10 +209,14 @@ class FlushJob { const std::string full_history_ts_low_; BlobFileCompletionCallback* blob_callback_; - // reference to the seqno_to_time_mapping_ in db_impl.h, not safe to read - // without db mutex - const SeqnoToTimeMapping& db_impl_seqno_to_time_mapping_; - SeqnoToTimeMapping seqno_to_time_mapping_; + // Shared copy of DB's seqno to time mapping stored in SuperVersion. The + // ownership is shared with this FlushJob when it's created. + // FlushJob accesses and ref counts immutable MemTables directly via + // `MemTableListVersion` instead of ref `SuperVersion`, so we need to give + // the flush job shared ownership of the mapping. + // Note this is only installed when seqno to time recording feature is + // enables, so it could be nullptr. + std::shared_ptr seqno_to_time_mapping_; // Keeps track of the newest user-defined timestamp for this flush job if // `persist_user_defined_timestamps` flag is false. diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 21d1571a05e..3ffb77d5378 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -25,6 +25,19 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +namespace { +std::string ValueWithWriteTime(std::string val, uint64_t write_time = 0) { + std::string result = val; + PutFixed64(&result, write_time); + return result; +} +std::string ValueWithPreferredSeqno(std::string val, + SequenceNumber preferred_seqno = 0) { + std::string result = val; + PutFixed64(&result, preferred_seqno); + return result; +} +} // namespace // TODO(icanadi) Mock out everything else: // 1. VersionSet @@ -55,7 +68,7 @@ class FlushJobTestBase : public testing::Test { } void NewDB() { - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); VersionEdit new_db; new_db.SetLogNumber(0); @@ -89,19 +102,19 @@ class FlushJobTestBase : public testing::Test { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(WriteOptions(), record); ASSERT_OK(s); for (const auto& e : new_cfs) { record.clear(); e.EncodeTo(&record); - s = log.AddRecord(record); + s = log.AddRecord(WriteOptions(), record); ASSERT_OK(s); } } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); } @@ -132,7 +145,7 @@ class FlushJobTestBase : public testing::Test { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); EXPECT_OK(versions_->Recover(column_families, false)); } @@ -156,7 +169,7 @@ class FlushJobTestBase : public testing::Test { bool persist_udt_ = true; bool paranoid_file_checks_ = false; - SeqnoToTimeMapping empty_seqno_to_time_mapping_; + std::shared_ptr empty_seqno_to_time_mapping_; }; class FlushJobTest : public FlushJobTestBase { @@ -588,7 +601,7 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); // When the state from WriteController is normal. - ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_HIGH); + ASSERT_EQ(flush_job.GetRateLimiterPriority(), Env::IO_HIGH); WriteController* write_controller = flush_job.versions_->GetColumnFamilySet()->write_controller(); @@ -597,17 +610,80 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { // When the state from WriteController is Delayed. std::unique_ptr delay_token = write_controller->GetDelayToken(1000000); - ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER); + ASSERT_EQ(flush_job.GetRateLimiterPriority(), Env::IO_USER); } { // When the state from WriteController is Stopped. std::unique_ptr stop_token = write_controller->GetStopToken(); - ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER); + ASSERT_EQ(flush_job.GetRateLimiterPriority(), Env::IO_USER); } } +TEST_F(FlushJobTest, ReplaceTimedPutWriteTimeWithPreferredSeqno) { + JobContext job_context(0); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + new_mem->Ref(); + std::shared_ptr seqno_to_time_mapping = + std::make_shared(); + // Seqno: 10, 11, ... 20, + // Time: ... 500 ... 600 + // GetProximalSeqnoBeforeTime(500) -> 10 + // GetProximalSeqnoBeforeTime(600) -> 20 + seqno_to_time_mapping->Append(10, 500); + seqno_to_time_mapping->Append(20, 600); + + ASSERT_OK(new_mem->Add(SequenceNumber(15), kTypeValuePreferredSeqno, "bar", + ValueWithWriteTime("bval", 500), + nullptr /*kv_prot_info*/)); + ASSERT_OK(new_mem->Add(SequenceNumber(18), kTypeValuePreferredSeqno, "foo", + ValueWithWriteTime("fval", 600), + nullptr /*kv_prot_info*/)); + + auto inserted_entries = mock::MakeMockFile(); + InternalKey smallest_internal_key("bar", SequenceNumber(15), + kTypeValuePreferredSeqno); + inserted_entries.push_back({smallest_internal_key.Encode().ToString(), + ValueWithPreferredSeqno("bval", 10)}); + InternalKey largest_internal_key("foo", SequenceNumber(18), kTypeValue); + inserted_entries.push_back( + {largest_internal_key.Encode().ToString(), "fval"}); + autovector to_delete; + new_mem->ConstructFragmentedRangeTombstones(); + cfd->imm()->Add(new_mem, &to_delete); + for (auto& m : to_delete) { + delete m; + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relevant + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, seqno_to_time_mapping); + + FileMetaData file_meta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(nullptr, &file_meta)); + mutex_.Unlock(); + + ASSERT_EQ(smallest_internal_key.Encode().ToString(), + file_meta.smallest.Encode().ToString()); + ASSERT_EQ(largest_internal_key.Encode().ToString(), + file_meta.largest.Encode().ToString()); + mock_table_factory_->AssertSingleFile(inserted_entries); + job_context.Clean(); +} + // Test parameters: // param 0): paranoid file check // param 1): user-defined timestamp test mode diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index c7691560eb8..a4cbdb46679 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -289,7 +289,7 @@ struct SVCleanupParams { // Used in PinnedIteratorsManager to release pinned SuperVersion void ForwardIterator::DeferredSVCleanup(void* arg) { - auto d = reinterpret_cast(arg); + auto d = static_cast(arg); ForwardIterator::SVCleanup(d->db, d->sv, d->background_purge_on_iterator_cleanup); delete d; @@ -611,6 +611,11 @@ Slice ForwardIterator::key() const { return current_->key(); } +uint64_t ForwardIterator::write_unix_time() const { + assert(valid_); + return current_->write_unix_time(); +} + Slice ForwardIterator::value() const { assert(valid_); return current_->value(); @@ -648,7 +653,7 @@ Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { *prop = std::to_string(sv_->version_number); return Status::OK(); } - return Status::InvalidArgument(); + return Status::InvalidArgument("Unrecognized property: " + prop_name); } void ForwardIterator::SetPinnedItersMgr( @@ -704,8 +709,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); - mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); - sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); + UnownedPtr seqno_to_time_mapping = + sv_->GetSeqnoToTimeMapping(); + mutable_iter_ = + sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); + sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, + &arena_); if (!read_options_.ignore_range_deletions) { std::unique_ptr range_del_iter( sv_->mem->NewRangeTombstoneIterator( @@ -769,8 +778,12 @@ void ForwardIterator::RenewIterators() { } imm_iters_.clear(); - mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); - svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); + UnownedPtr seqno_to_time_mapping = + svnew->GetSeqnoToTimeMapping(); + mutable_iter_ = + svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); + svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, + &arena_); ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { diff --git a/db/forward_iterator.h b/db/forward_iterator.h index cb418aeeb0a..9f1b4379b9d 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -70,19 +70,19 @@ class ForwardIterator : public InternalIterator { valid_ = false; } - virtual bool Valid() const override; + bool Valid() const override; void SeekToFirst() override; - virtual void Seek(const Slice& target) override; - virtual void Next() override; - virtual Slice key() const override; - virtual Slice value() const override; - virtual Status status() const override; - virtual bool PrepareValue() override; - virtual Status GetProperty(std::string prop_name, std::string* prop) override; - virtual void SetPinnedItersMgr( - PinnedIteratorsManager* pinned_iters_mgr) override; - virtual bool IsKeyPinned() const override; - virtual bool IsValuePinned() const override; + void Seek(const Slice& target) override; + void Next() override; + Slice key() const override; + Slice value() const override; + uint64_t write_unix_time() const override; + Status status() const override; + bool PrepareValue() override; + Status GetProperty(std::string prop_name, std::string* prop) override; + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override; + bool IsKeyPinned() const override; + bool IsValuePinned() const override; bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters); diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index f7b8a50aef0..4d5c65616e1 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -18,7 +18,6 @@ #include "file/random_access_file_reader.h" #include "logging/logging.h" #include "table/merging_iterator.h" -#include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" #include "table/unique_id_impl.h" @@ -126,9 +125,10 @@ Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, } } if (!hardlink_files) { - status = - CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, - db_options_.use_fsync, io_tracer_, Temperature::kUnknown); + // FIXME: temperature handling (like ExternalSstFileIngestionJob) + status = CopyFile(fs_.get(), path_outside_db, Temperature::kUnknown, + path_inside_db, Temperature::kUnknown, 0, + db_options_.use_fsync, io_tracer_); } if (!status.ok()) { break; @@ -355,7 +355,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // in file_meta. if (file_meta.smallest.empty()) { assert(file_meta.largest.empty()); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index f6c1a024839..89586bcd18e 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -326,7 +326,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithRangeTombstone) { const SstFileMetaData* file_meta = nullptr; for (const auto& level_meta : import_cf_meta.levels) { if (!level_meta.files.empty()) { - file_meta = &(level_meta.files[0]); + file_meta = level_meta.files.data(); break; } } @@ -389,7 +389,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { *metadata_ptr_, &import_cfh2_)); ASSERT_NE(import_cfh2_, nullptr); delete metadata_ptr_; - metadata_ptr_ = NULL; + metadata_ptr_ = nullptr; std::string value1, value2; diff --git a/db/internal_stats.cc b/db/internal_stats.cc index d690d352961..f43979a5020 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -1162,7 +1162,7 @@ bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { bool InternalStats::HandleAggregatedTableProperties(std::string* value, Slice /*suffix*/) { std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { @@ -1184,7 +1184,7 @@ static std::map MapUint64ValuesToString( bool InternalStats::HandleAggregatedTablePropertiesMap( std::map* values, Slice /*suffix*/) { std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { @@ -1202,7 +1202,7 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, return false; } std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( read_options, &tp, static_cast(level)); @@ -1221,7 +1221,7 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( return false; } std::shared_ptr tp; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( read_options, &tp, static_cast(level)); @@ -1432,7 +1432,7 @@ bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* /*db*/, Version* version) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; *value = (version == nullptr) ? 0 @@ -1487,7 +1487,7 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, ->compaction_options_fifo.allow_compaction) { return false; } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; TablePropertiesCollection collection; auto s = cfd_->current()->GetPropertiesOfAllTables(read_options, &collection); @@ -2081,6 +2081,12 @@ void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001), interval_compact_micros / kMicrosInSec); value->append(buf); + + snprintf(buf, sizeof(buf), + "Estimated pending compaction bytes: %" PRIu64 "\n", + vstorage->estimated_compaction_needed_bytes()); + value->append(buf); + if (is_periodic) { cf_stats_snapshot_.compact_bytes_write = compact_bytes_write; cf_stats_snapshot_.compact_bytes_read = compact_bytes_read; diff --git a/db/internal_stats.h b/db/internal_stats.h index 58275b145db..70e3f827110 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -58,7 +58,7 @@ struct DBPropertyInfo { bool (DBImpl::*handle_string_dbimpl)(std::string* value); }; -extern const DBPropertyInfo* GetPropertyInfo(const Slice& property); +const DBPropertyInfo* GetPropertyInfo(const Slice& property); #undef SCORE enum class LevelStatType { @@ -432,7 +432,7 @@ class InternalStats { explicit CompactionStatsFull() : stats(), penultimate_level_stats() {} explicit CompactionStatsFull(CompactionReason reason, int c) - : stats(reason, c), penultimate_level_stats(reason, c){}; + : stats(reason, c), penultimate_level_stats(reason, c){} uint64_t TotalBytesWritten() const { uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; diff --git a/db/job_context.h b/db/job_context.h index 48728f48d6b..272b79a2162 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -35,6 +35,12 @@ struct SuperVersionContext { std::unique_ptr new_superversion; // if nullptr no new superversion + // If not nullptr, a new seqno to time mapping is available to be installed. + // Otherwise, make a shared copy of the one in the existing SuperVersion and + // carry it over to the new SuperVersion. This is moved to the SuperVersion + // during installation. + std::shared_ptr new_seqno_to_time_mapping{nullptr}; + explicit SuperVersionContext(bool create_superversion = false) : new_superversion(create_superversion ? new SuperVersion() : nullptr) {} diff --git a/db/listener_test.cc b/db/listener_test.cc index 41577b92c17..d298a86e7ea 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -132,8 +132,8 @@ class TestCompactionListener : public EventListener { ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id); ASSERT_GT(ci.thread_id, 0U); - for (auto fl : {ci.input_files, ci.output_files}) { - for (auto fn : fl) { + for (const auto& fl : {ci.input_files, ci.output_files}) { + for (const auto& fn : fl) { auto it = ci.table_properties.find(fn); ASSERT_NE(it, ci.table_properties.end()); auto tp = it->second; @@ -237,7 +237,7 @@ class TestFlushListener : public EventListener { std::vector thread_list; ASSERT_OK(env_->GetThreadList(&thread_list)); bool found_match = false; - for (auto thread_status : thread_list) { + for (const auto& thread_status : thread_list) { if (thread_status.operation_type == ThreadStatus::OP_FLUSH || thread_status.operation_type == ThreadStatus::OP_COMPACTION) { if (thread_id == thread_status.thread_id) { @@ -893,7 +893,7 @@ class MemTableSealedListener : public EventListener { SequenceNumber latest_seq_number_; public: - MemTableSealedListener() {} + MemTableSealedListener() = default; void OnMemTableSealed(const MemTableInfo& info) override { latest_seq_number_ = info.first_seqno; } diff --git a/db/log_reader.cc b/db/log_reader.cc index 4e470616f05..da979a1ee1e 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -9,7 +9,7 @@ #include "db/log_reader.h" -#include +#include #include "file/sequence_file_reader.h" #include "port/lang.h" @@ -18,10 +18,9 @@ #include "util/coding.h" #include "util/crc32c.h" -namespace ROCKSDB_NAMESPACE { -namespace log { +namespace ROCKSDB_NAMESPACE::log { -Reader::Reporter::~Reporter() {} +Reader::Reporter::~Reporter() = default; Reader::Reader(std::shared_ptr info_log, std::unique_ptr&& _file, @@ -44,7 +43,7 @@ Reader::Reader(std::shared_ptr info_log, compression_type_record_read_(false), uncompress_(nullptr), hash_state_(nullptr), - uncompress_hash_state_(nullptr){}; + uncompress_hash_state_(nullptr){} Reader::~Reader() { delete[] backing_store_; @@ -259,6 +258,10 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, // writing a physical record but before completing the next; don't // treat it as a corruption, just ignore the entire logical record. scratch->clear(); + } else { + if (wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + ReportOldLogRecord(scratch->size()); + } } return false; } @@ -406,6 +409,12 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) { } } +void Reader::ReportOldLogRecord(size_t bytes) { + if (reporter_ != nullptr) { + reporter_->OldLogRecord(bytes); + } +} + bool Reader::ReadMore(size_t* drop_size, int* error) { if (!eof_ && !read_error_) { // Last read was a full read, so this is a trailer to skip @@ -937,5 +946,4 @@ bool FragmentBufferedReader::TryReadFragment( } } -} // namespace log -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::log diff --git a/db/log_reader.h b/db/log_reader.h index 697d1b5d58c..6e4eded0916 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -45,6 +45,8 @@ class Reader { // Some corruption was detected. "size" is the approximate number // of bytes dropped due to the corruption. virtual void Corruption(size_t bytes, const Status& status) = 0; + + virtual void OldLogRecord(size_t /*bytes*/) {} }; // Create a reader that will return log records from "*file". @@ -202,6 +204,7 @@ class Reader { // buffer_ must be updated to remove the dropped bytes prior to invocation. void ReportCorruption(size_t bytes, const char* reason); void ReportDrop(size_t bytes, const Status& reason); + void ReportOldLogRecord(size_t bytes); void InitCompression(const CompressionTypeRecord& compression_record); diff --git a/db/log_test.cc b/db/log_test.cc index fa5e2aa0fcd..79ff02a04b7 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -19,8 +19,7 @@ #include "util/random.h" #include "utilities/memory_allocators.h" -namespace ROCKSDB_NAMESPACE { -namespace log { +namespace ROCKSDB_NAMESPACE::log { // Construct a string of the specified length made out of the supplied // partial string. @@ -185,9 +184,10 @@ class LogTest void Write(const std::string& msg, const UnorderedMap* cf_to_ts_sz = nullptr) { if (cf_to_ts_sz != nullptr && !cf_to_ts_sz->empty()) { - ASSERT_OK(writer_->MaybeAddUserDefinedTimestampSizeRecord(*cf_to_ts_sz)); + ASSERT_OK(writer_->MaybeAddUserDefinedTimestampSizeRecord(WriteOptions(), + *cf_to_ts_sz)); } - ASSERT_OK(writer_->AddRecord(Slice(msg))); + ASSERT_OK(writer_->AddRecord(WriteOptions(), Slice(msg))); } size_t WrittenBytes() const { return dest_contents().size(); } @@ -732,8 +732,8 @@ TEST_P(LogTest, Recycle) { std::unique_ptr dest_holder(new WritableFileWriter( std::move(sink), "" /* don't care */, FileOptions())); Writer recycle_writer(std::move(dest_holder), 123, true); - ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); - ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); ASSERT_EQ("foooo", Read()); ASSERT_EQ("bar", Read()); @@ -764,9 +764,10 @@ TEST_P(LogTest, RecycleWithTimestampSize) { UnorderedMap ts_sz_two = { {2, sizeof(uint64_t)}, }; - ASSERT_OK(recycle_writer.MaybeAddUserDefinedTimestampSizeRecord(ts_sz_two)); - ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); - ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); + ASSERT_OK(recycle_writer.MaybeAddUserDefinedTimestampSizeRecord( + WriteOptions(), ts_sz_two)); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(WriteOptions(), Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); CheckRecordAndTimestampSize("foooo", ts_sz_two); CheckRecordAndTimestampSize("bar", ts_sz_two); @@ -853,12 +854,12 @@ class RetriableLogTest : public ::testing::TestWithParam { std::string contents() { return sink_->contents_; } void Encode(const std::string& msg) { - ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), Slice(msg))); } void Write(const Slice& data) { - ASSERT_OK(writer_->Append(data)); - ASSERT_OK(writer_->Sync(true)); + ASSERT_OK(writer_->Append(IOOptions(), data)); + ASSERT_OK(writer_->Sync(IOOptions(), true)); } bool TryRead(std::string* result) { @@ -991,7 +992,9 @@ INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2)); class CompressionLogTest : public LogTest { public: - Status SetupTestEnv() { return writer_->AddCompressionTypeRecord(); } + Status SetupTestEnv() { + return writer_->AddCompressionTypeRecord(WriteOptions()); + } }; TEST_P(CompressionLogTest, Empty) { @@ -1109,7 +1112,7 @@ TEST_P(CompressionLogTest, AlignedFragmentation) { // beginning of the block. while ((WrittenBytes() & (kBlockSize - 1)) >= kHeaderSize) { char entry = 'a'; - ASSERT_OK(writer_->AddRecord(Slice(&entry, 1))); + ASSERT_OK(writer_->AddRecord(WriteOptions(), Slice(&entry, 1))); num_filler_records++; } const std::vector wal_entries = { @@ -1167,7 +1170,7 @@ TEST_P(StreamingCompressionTest, Basic) { } allocator->Deallocate((void*)output_buffer); } while (remaining > 0); - std::string uncompressed_buffer = ""; + std::string uncompressed_buffer; int ret_val = 0; size_t output_pos; char* uncompressed_output_buffer = (char*)allocator->Allocate(kBlockSize); @@ -1202,8 +1205,7 @@ INSTANTIATE_TEST_CASE_P( kBlockSize * 2), ::testing::Values(CompressionType::kZSTD))); -} // namespace log -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::log int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/db/log_writer.cc b/db/log_writer.cc index 86e0286ccd5..e61efc9eefc 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -9,7 +9,7 @@ #include "db/log_writer.h" -#include +#include #include "file/writable_file_writer.h" #include "rocksdb/env.h" @@ -18,8 +18,7 @@ #include "util/crc32c.h" #include "util/udt_util.h" -namespace ROCKSDB_NAMESPACE { -namespace log { +namespace ROCKSDB_NAMESPACE::log { Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, bool recycle_log_files, bool manual_flush, @@ -38,32 +37,46 @@ Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, } Writer::~Writer() { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); if (dest_) { - WriteBuffer().PermitUncheckedError(); + WriteBuffer(WriteOptions()).PermitUncheckedError(); } if (compress_) { delete compress_; } + ThreadStatusUtil::SetThreadOperation(cur_op_type); } -IOStatus Writer::WriteBuffer() { +IOStatus Writer::WriteBuffer(const WriteOptions& write_options) { if (dest_->seen_error()) { return IOStatus::IOError("Seen error. Skip writing buffer."); } - return dest_->Flush(); + IOOptions opts; + IOStatus s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + return dest_->Flush(opts); } -IOStatus Writer::Close() { +IOStatus Writer::Close(const WriteOptions& write_options) { IOStatus s; - if (dest_) { - s = dest_->Close(); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok() && dest_) { + s = dest_->Close(opts); dest_.reset(); } return s; } -IOStatus Writer::AddRecord(const Slice& slice, - Env::IOPriority rate_limiter_priority) { +IOStatus Writer::AddRecord(const WriteOptions& write_options, + const Slice& slice) { + if (dest_->seen_error()) { + return IOStatus::IOError("Seen error. Skip writing buffer."); + } const char* ptr = slice.data(); size_t left = slice.size(); @@ -83,83 +96,87 @@ IOStatus Writer::AddRecord(const Slice& slice, } IOStatus s; - do { - const int64_t leftover = kBlockSize - block_offset_; - assert(leftover >= 0); - if (leftover < header_size) { - // Switch to a new block - if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize and - // kRecyclableHeaderSize being <= 11) - assert(header_size <= 11); - s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - static_cast(leftover)), - 0 /* crc32c_checksum */, rate_limiter_priority); - if (!s.ok()) { - break; + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + do { + const int64_t leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < header_size) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + s = dest_->Append(opts, + Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast(leftover)), + 0 /* crc32c_checksum */); + if (!s.ok()) { + break; + } } + block_offset_ = 0; } - block_offset_ = 0; - } - // Invariant: we never leave < header_size bytes in a block. - assert(static_cast(kBlockSize - block_offset_) >= header_size); - - const size_t avail = kBlockSize - block_offset_ - header_size; - - // Compress the record if compression is enabled. - // Compress() is called at least once (compress_start=true) and after the - // previous generated compressed chunk is written out as one or more - // physical records (left=0). - if (compress_ && (compress_start || left == 0)) { - compress_remaining = compress_->Compress(slice.data(), slice.size(), - compressed_buffer_.get(), &left); - - if (compress_remaining < 0) { - // Set failure status - s = IOStatus::IOError("Unexpected WAL compression error"); - s.SetDataLoss(true); - break; - } else if (left == 0) { - // Nothing left to compress - if (!compress_start) { + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize - block_offset_) >= header_size); + + const size_t avail = kBlockSize - block_offset_ - header_size; + + // Compress the record if compression is enabled. + // Compress() is called at least once (compress_start=true) and after the + // previous generated compressed chunk is written out as one or more + // physical records (left=0). + if (compress_ && (compress_start || left == 0)) { + compress_remaining = compress_->Compress( + slice.data(), slice.size(), compressed_buffer_.get(), &left); + + if (compress_remaining < 0) { + // Set failure status + s = IOStatus::IOError("Unexpected WAL compression error"); + s.SetDataLoss(true); break; + } else if (left == 0) { + // Nothing left to compress + if (!compress_start) { + break; + } } + compress_start = false; + ptr = compressed_buffer_.get(); } - compress_start = false; - ptr = compressed_buffer_.get(); - } - const size_t fragment_length = (left < avail) ? left : avail; - - RecordType type; - const bool end = (left == fragment_length && compress_remaining == 0); - if (begin && end) { - type = recycle_log_files_ ? kRecyclableFullType : kFullType; - } else if (begin) { - type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; - } else if (end) { - type = recycle_log_files_ ? kRecyclableLastType : kLastType; - } else { - type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType; - } - - s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority); - ptr += fragment_length; - left -= fragment_length; - begin = false; - } while (s.ok() && (left > 0 || compress_remaining > 0)); + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool end = (left == fragment_length && compress_remaining == 0); + if (begin && end) { + type = recycle_log_files_ ? kRecyclableFullType : kFullType; + } else if (begin) { + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; + } else if (end) { + type = recycle_log_files_ ? kRecyclableLastType : kLastType; + } else { + type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType; + } + s = EmitPhysicalRecord(write_options, type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && (left > 0 || compress_remaining > 0)); + } if (s.ok()) { if (!manual_flush_) { - s = dest_->Flush(rate_limiter_priority); + s = dest_->Flush(opts); } } return s; } -IOStatus Writer::AddCompressionTypeRecord() { +IOStatus Writer::AddCompressionTypeRecord(const WriteOptions& write_options) { // Should be the first record assert(block_offset_ == 0); @@ -168,14 +185,22 @@ IOStatus Writer::AddCompressionTypeRecord() { return IOStatus::OK(); } + if (dest_->seen_error()) { + return IOStatus::IOError("Seen error. Skip writing buffer."); + } + CompressionTypeRecord record(compression_type_); std::string encode; record.EncodeTo(&encode); - IOStatus s = - EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size()); + IOStatus s = EmitPhysicalRecord(write_options, kSetCompressionType, + encode.data(), encode.size()); if (s.ok()) { if (!manual_flush_) { - s = dest_->Flush(); + IOOptions io_opts; + s = WritableFileWriter::PrepareIOOptions(write_options, io_opts); + if (s.ok()) { + s = dest_->Flush(io_opts); + } } // Initialize fields required for compression const size_t max_output_buffer_len = @@ -197,8 +222,8 @@ IOStatus Writer::AddCompressionTypeRecord() { } IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord( - const UnorderedMap& cf_to_ts_sz, - Env::IOPriority rate_limiter_priority) { + const WriteOptions& write_options, + const UnorderedMap& cf_to_ts_sz) { std::vector> ts_sz_to_record; for (const auto& [cf_id, ts_sz] : cf_to_ts_sz) { if (recorded_cf_to_ts_sz_.count(cf_id) != 0) { @@ -219,14 +244,14 @@ IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord( record.EncodeTo(&encoded); RecordType type = recycle_log_files_ ? kRecyclableUserDefinedTimestampSizeType : kUserDefinedTimestampSizeType; - return EmitPhysicalRecord(type, encoded.data(), encoded.size(), - rate_limiter_priority); + return EmitPhysicalRecord(write_options, type, encoded.data(), + encoded.size()); } bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); } -IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, - Env::IOPriority rate_limiter_priority) { +IOStatus Writer::EmitPhysicalRecord(const WriteOptions& write_options, + RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes size_t header_size; @@ -266,14 +291,16 @@ IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, EncodeFixed32(buf, crc); // Write the header and the payload - IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */, - rate_limiter_priority); + IOOptions opts; + IOStatus s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = dest_->Append(opts, Slice(buf, header_size), 0 /* crc32c_checksum */); + } if (s.ok()) { - s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority); + s = dest_->Append(opts, Slice(ptr, n), payload_crc); } block_offset_ += header_size + n; return s; } -} // namespace log -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::log diff --git a/db/log_writer.h b/db/log_writer.h index 7a64a856015..1bbf72569ec 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -86,9 +86,8 @@ class Writer { ~Writer(); - IOStatus AddRecord(const Slice& slice, - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); - IOStatus AddCompressionTypeRecord(); + IOStatus AddRecord(const WriteOptions& write_options, const Slice& slice); + IOStatus AddCompressionTypeRecord(const WriteOptions& write_options); // If there are column families in `cf_to_ts_sz` not included in // `recorded_cf_to_ts_sz_` and its user-defined timestamp size is non-zero, @@ -96,17 +95,17 @@ class Writer { // kRecyclableUserDefinedTimestampSizeType for these column families. // This timestamp size record applies to all subsequent records. IOStatus MaybeAddUserDefinedTimestampSizeRecord( - const UnorderedMap& cf_to_ts_sz, - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + const WriteOptions& write_options, + const UnorderedMap& cf_to_ts_sz); WritableFileWriter* file() { return dest_.get(); } const WritableFileWriter* file() const { return dest_.get(); } uint64_t get_log_number() const { return log_number_; } - IOStatus WriteBuffer(); + IOStatus WriteBuffer(const WriteOptions& write_options); - IOStatus Close(); + IOStatus Close(const WriteOptions& write_options); bool BufferIsEmpty(); @@ -121,9 +120,8 @@ class Writer { // record type stored in the header. uint32_t type_crc_[kMaxRecordType + 1]; - IOStatus EmitPhysicalRecord( - RecordType type, const char* ptr, size_t length, - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + IOStatus EmitPhysicalRecord(const WriteOptions& write_options, + RecordType type, const char* ptr, size_t length); // If true, it does not flush after each write. Instead it relies on the upper // layer to manually does the flush by calling ::WriteBuffer() diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index 641e01f9a39..0fd641630a1 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -9,8 +9,7 @@ #include "db/malloc_stats.h" -#include - +#include #include #include "port/jemalloc_helper.h" @@ -25,7 +24,7 @@ struct MallocStatus { }; static void GetJemallocStatus(void* mstat_arg, const char* status) { - MallocStatus* mstat = reinterpret_cast(mstat_arg); + MallocStatus* mstat = static_cast(mstat_arg); size_t status_len = status ? strlen(status) : 0; size_t buf_size = (size_t)(mstat->end - mstat->cur); if (!status_len || status_len > buf_size) { diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 95b099a66dc..e8403106542 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -58,7 +58,7 @@ class ManualCompactionTest : public testing::Test { class DestroyAllCompactionFilter : public CompactionFilter { public: - DestroyAllCompactionFilter() {} + DestroyAllCompactionFilter() = default; bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value, std::string* /*new_value*/, diff --git a/db/memtable.cc b/db/memtable.cc index 4b2360d4050..ae89c2cf093 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -61,6 +61,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), + strict_max_successive_merges( + mutable_cf_options.strict_max_successive_merges), statistics(ioptions.stats), merge_operator(ioptions.merge_operator.get()), info_log(ioptions.logger), @@ -379,11 +381,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator : public InternalIterator { public: MemTableIterator(const MemTable& mem, const ReadOptions& read_options, + UnownedPtr seqno_to_time_mapping, Arena* arena, bool use_range_del_table = false) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), valid_(false), + seqno_to_time_mapping_(seqno_to_time_mapping), arena_mode_(arena != nullptr), value_pinned_( !mem.GetImmutableMemTableOptions()->inplace_update_support), @@ -514,6 +518,21 @@ class MemTableIterator : public InternalIterator { assert(Valid()); return GetLengthPrefixedSlice(iter_->key()); } + + uint64_t write_unix_time() const override { + assert(Valid()); + ParsedInternalKey pikey; + Status s = ParseInternalKey(key(), &pikey, /*log_err_key=*/false); + if (!s.ok()) { + return std::numeric_limits::max(); + } else if (kTypeValuePreferredSeqno == pikey.type) { + return ParsePackedValueForWriteTime(value()); + } else if (!seqno_to_time_mapping_ || seqno_to_time_mapping_->Empty()) { + return std::numeric_limits::max(); + } + return seqno_to_time_mapping_->GetProximalTimeBeforeSeqno(pikey.sequence); + } + Slice value() const override { assert(Valid()); Slice key_slice = GetLengthPrefixedSlice(iter_->key()); @@ -538,6 +557,8 @@ class MemTableIterator : public InternalIterator { const MemTable::KeyComparator comparator_; MemTableRep::Iterator* iter_; bool valid_; + // The seqno to time mapping is owned by the SuperVersion. + UnownedPtr seqno_to_time_mapping_; bool arena_mode_; bool value_pinned_; uint32_t protection_bytes_per_key_; @@ -556,11 +577,13 @@ class MemTableIterator : public InternalIterator { } }; -InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, - Arena* arena) { +InternalIterator* MemTable::NewIterator( + const ReadOptions& read_options, + UnownedPtr seqno_to_time_mapping, Arena* arena) { assert(arena != nullptr); auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) MemTableIterator(*this, read_options, arena); + return new (mem) + MemTableIterator(*this, read_options, seqno_to_time_mapping, arena); } FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( @@ -594,9 +617,9 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( if (!cache->initialized.load(std::memory_order_acquire)) { cache->reader_mutex.lock(); if (!cache->tombstones) { - auto* unfragmented_iter = - new MemTableIterator(*this, read_options, nullptr /* arena */, - true /* use_range_del_table */); + auto* unfragmented_iter = new MemTableIterator( + *this, read_options, nullptr /* seqno_to_time_mapping= */, + nullptr /* arena */, true /* use_range_del_table */); cache->tombstones.reset(new FragmentedRangeTombstoneList( std::unique_ptr(unfragmented_iter), comparator_.comparator)); @@ -614,10 +637,10 @@ void MemTable::ConstructFragmentedRangeTombstones() { assert(!IsFragmentedRangeTombstonesConstructed(false)); // There should be no concurrent Construction if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { - // TODO: plumb Env::IOActivity - auto* unfragmented_iter = - new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, - true /* use_range_del_table */); + // TODO: plumb Env::IOActivity, Env::IOPriority + auto* unfragmented_iter = new MemTableIterator( + *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/, + nullptr /* arena */, true /* use_range_del_table */); fragmented_range_tombstone_list_ = std::make_unique( @@ -922,7 +945,7 @@ struct Saver { static bool SaveValue(void* arg, const char* entry) { TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry); - Saver* s = reinterpret_cast(arg); + Saver* s = static_cast(arg); assert(s != nullptr); assert(!s->value || !s->columns); @@ -1007,7 +1030,8 @@ static bool SaveValue(void* arg, const char* entry) { if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex || type == kTypeWideColumnEntity || type == kTypeDeletion || - type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) && + type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp || + type == kTypeValuePreferredSeqno) && max_covering_tombstone_seq > seq) { type = kTypeRangeDeletion; } @@ -1059,13 +1083,18 @@ static bool SaveValue(void* arg, const char* entry) { return false; } - case kTypeValue: { + case kTypeValue: + case kTypeValuePreferredSeqno: { if (s->inplace_update_support) { s->mem->GetLock(s->key->user_key())->ReadLock(); } Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + if (type == kTypeValuePreferredSeqno) { + v = ParsePackedValueForValue(v); + } + *(s->status) = Status::OK(); if (!s->do_merge) { @@ -1087,8 +1116,8 @@ static bool SaveValue(void* arg, const char* entry) { merge_operator, s->key->user_key(), MergeHelper::kPlainBaseValue, v, merge_context->GetOperands(), s->logger, s->statistics, s->clock, - /* update_num_ops_stats */ true, s->value, s->columns, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + s->value, s->columns); } } else if (s->value) { s->value->assign(v.data(), v.size()); @@ -1140,8 +1169,8 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), MergeHelper::kWideBaseValue, v, merge_context->GetOperands(), s->logger, s->statistics, - s->clock, /* update_num_ops_stats */ true, s->value, s->columns, - /* op_failure_scope */ nullptr); + s->clock, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr, s->value, s->columns); } } else if (s->value) { Slice value_of_default; @@ -1178,8 +1207,8 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), MergeHelper::kNoBaseValue, merge_context->GetOperands(), s->logger, s->statistics, - s->clock, /* update_num_ops_stats */ true, s->value, s->columns, - /* op_failure_scope */ nullptr); + s->clock, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr, s->value, s->columns); } else { // We have found a final value (a base deletion) and have newer // merge operands that we do not intend to merge. Nothing remains @@ -1218,13 +1247,21 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), MergeHelper::kNoBaseValue, merge_context->GetOperands(), s->logger, s->statistics, - s->clock, /* update_num_ops_stats */ true, s->value, s->columns, - /* op_failure_scope */ nullptr); + s->clock, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr, s->value, s->columns); } *(s->found_final_value) = true; return false; } + if (merge_context->get_merge_operands_options != nullptr && + merge_context->get_merge_operands_options->continue_cb != nullptr && + !merge_context->get_merge_operands_options->continue_cb(v)) { + // We were told not to continue. + *(s->found_final_value) = true; + return false; + } + return true; } default: { @@ -1388,7 +1425,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, range_indexes[num_keys++] = iter.index(); } } - bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]); + bloom_filter_->MayContain(num_keys, bloom_keys.data(), may_match.data()); for (int i = 0; i < num_keys; ++i) { if (!may_match[i]) { temp_range.SkipIndex(range_indexes[i]); @@ -1610,7 +1647,8 @@ Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, return Status::NotFound(); } -size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key, + size_t limit) { Slice memkey = key.memtable_key(); // A total ordered iterator is costly for some memtablerep (prefix aware @@ -1622,7 +1660,7 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { size_t num_successive_merges = 0; - for (; iter->Valid(); iter->Next()) { + for (; iter->Valid() && num_successive_merges < limit; iter->Next()) { const char* entry = iter->key(); uint32_t key_length = 0; const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); diff --git a/db/memtable.h b/db/memtable.h index d3d2322c783..4cc3006cdfd 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -20,6 +20,7 @@ #include "db/kv_checksum.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "db/seqno_to_time_mapping.h" #include "db/version_edit.h" #include "memory/allocator.h" #include "memory/concurrent_arena.h" @@ -28,6 +29,7 @@ #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "table/multiget_context.h" +#include "util/cast_util.h" #include "util/dynamic_bloom.h" #include "util/hash.h" #include "util/hash_containers.h" @@ -54,6 +56,7 @@ struct ImmutableMemTableOptions { Slice delta_value, std::string* merged_value); size_t max_successive_merges; + bool strict_max_successive_merges; Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; @@ -90,10 +93,10 @@ class MemTable { struct KeyComparator : public MemTableRep::KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {} - virtual int operator()(const char* prefix_len_key1, - const char* prefix_len_key2) const override; - virtual int operator()(const char* prefix_len_key, - const DecodedType& key) const override; + int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const override; + int operator()(const char* prefix_len_key, + const DecodedType& key) const override; }; // MemTables are reference counted. The initial reference count @@ -202,7 +205,11 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena); + // seqno_to_time_mapping: it's used to support return write unix time for the + // data, currently only needed for iterators serving user reads. + InternalIterator* NewIterator( + const ReadOptions& read_options, + UnownedPtr seqno_to_time_mapping, Arena* arena); // Returns an iterator that yields the range tombstones of the memtable. // The caller must ensure that the underlying MemTable remains live @@ -319,9 +326,10 @@ class MemTable { const ProtectionInfoKVOS64* kv_prot_info); // Returns the number of successive merge entries starting from the newest - // entry for the key up to the last non-merge entry or last entry for the - // key in the memtable. - size_t CountSuccessiveMergeEntries(const LookupKey& key); + // entry for the key. The count ends when the oldest entry in the memtable + // with which the newest entry would be merged is reached, or the count + // reaches `limit`. + size_t CountSuccessiveMergeEntries(const LookupKey& key, size_t limit); // Update counters and flush status after inserting a whole write batch // Used in concurrent memtable inserts. @@ -713,6 +721,6 @@ class MemTable { void MaybeUpdateNewestUDT(const Slice& user_key); }; -extern const char* EncodeKey(std::string* scratch, const Slice& target); +const char* EncodeKey(std::string* scratch, const Slice& target); } // namespace ROCKSDB_NAMESPACE diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ebf0813c2d8..6a7283ddf7a 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -211,18 +211,22 @@ Status MemTableListVersion::AddRangeTombstoneIterators( } void MemTableListVersion::AddIterators( - const ReadOptions& options, std::vector* iterator_list, - Arena* arena) { + const ReadOptions& options, + UnownedPtr seqno_to_time_mapping, + std::vector* iterator_list, Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back(m->NewIterator(options, arena)); + iterator_list->push_back( + m->NewIterator(options, seqno_to_time_mapping, arena)); } } -void MemTableListVersion::AddIterators(const ReadOptions& options, - MergeIteratorBuilder* merge_iter_builder, - bool add_range_tombstone_iter) { +void MemTableListVersion::AddIterators( + const ReadOptions& options, + UnownedPtr seqno_to_time_mapping, + MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) { for (auto& m : memlist_) { - auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena()); + auto mem_iter = m->NewIterator(options, seqno_to_time_mapping, + merge_iter_builder->GetArena()); if (!add_range_tombstone_iter || options.ignore_range_deletions) { merge_iter_builder->AddIterator(mem_iter); } else { @@ -502,6 +506,7 @@ Status MemTableList::TryInstallMemtableFlushResults( mu->AssertHeld(); const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); // Flush was successful // Record the status on the memtable object. Either this call or a call by a @@ -614,10 +619,10 @@ Status MemTableList::TryInstallMemtableFlushResults( }; if (write_edits) { // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, read_options, edit_list, - mu, db_directory, /*new_descriptor_log=*/false, - /*column_family_options=*/nullptr, - manifest_write_cb); + s = vset->LogAndApply( + cfd, mutable_cf_options, read_options, write_options, edit_list, mu, + db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, manifest_write_cb); } else { // If write_edit is false (e.g: successful mempurge), // then remove old memtables, wake up manifest write queue threads, @@ -835,6 +840,7 @@ Status InstallMemtableAtomicFlushResults( mu->AssertHeld(); const ReadOptions read_options(Env::IOActivity::kFlush); + const WriteOptions write_options(Env::IOActivity::kFlush); size_t num = mems_list.size(); assert(cfds.size() == num); @@ -936,8 +942,8 @@ Status InstallMemtableAtomicFlushResults( } // this can release and reacquire the mutex. - s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, - mu, db_directory); + s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, + write_options, edit_lists, mu, db_directory); for (size_t k = 0; k != cfds.size(); ++k) { auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); diff --git a/db/memtable_list.h b/db/memtable_list.h index f4f342ed544..08b9d658e19 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -112,10 +112,12 @@ class MemTableListVersion { RangeDelAggregator* range_del_agg); void AddIterators(const ReadOptions& options, + UnownedPtr seqno_to_time_mapping, std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, + UnownedPtr seqno_to_time_mapping, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter); @@ -514,7 +516,7 @@ class MemTableList { // installs flush results for external immutable memtable lists other than the // cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case, // imm_lists parameter is not nullptr. -extern Status InstallMemtableAtomicFlushResults( +Status InstallMemtableAtomicFlushResults( const autovector* imm_lists, const autovector& cfds, const autovector& mutable_cf_options_list, diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 9a5b7557f89..bb4e44761ef 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -18,8 +18,17 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" +#include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { +namespace { +std::string ValueWithWriteTime(std::string value, uint64_t write_time) { + std::string result; + result = value; + PutFixed64(&result, write_time); + return result; +} +} // namespace class MemTableListTest : public testing::Test { public: @@ -108,7 +117,7 @@ class MemTableListTest : public testing::Test { &write_controller, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr); + /*error_handler=*/nullptr, /*read_only=*/false); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -160,7 +169,7 @@ class MemTableListTest : public testing::Test { &write_controller, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr); + /*error_handler=*/nullptr, /*read_only=*/false); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -255,6 +264,7 @@ TEST_F(MemTableListTest, GetTest) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); @@ -271,6 +281,9 @@ TEST_F(MemTableListTest, GetTest) { nullptr /* kv_prot_info */)); ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValuePreferredSeqno, "key3", + ValueWithWriteTime("value3.1", 20), + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); @@ -297,7 +310,15 @@ TEST_F(MemTableListTest, GetTest) { ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); - ASSERT_EQ(4, mem->num_entries()); + merge_context.Clear(); + found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr, + /*timestamp*/ nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions(), + false /* immutable_memtable */); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value3.1"); + + ASSERT_EQ(5, mem->num_entries()); ASSERT_EQ(1, mem->num_deletes()); // Add memtable to list @@ -318,6 +339,8 @@ TEST_F(MemTableListTest, GetTest) { mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeMerge, "key3", "value3.2", + nullptr /* kv_prot_info */)); // Add second memtable to list // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed()) @@ -355,6 +378,14 @@ TEST_F(MemTableListTest, GetTest) { &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value3.1,value3.2"); + ASSERT_EQ(2, list.NumNotFlushed()); list.current()->Unref(&to_delete); diff --git a/db/merge_context.h b/db/merge_context.h index 8a7b0729020..f0cd9633602 100644 --- a/db/merge_context.h +++ b/db/merge_context.h @@ -9,6 +9,7 @@ #include #include +#include "rocksdb/db.h" #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -21,6 +22,8 @@ const std::vector empty_operand_list; // will be fetched from the context when issuing partial of full merge. class MergeContext { public: + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + // Clear all the operands void Clear() { if (operand_list_) { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index d8b1d788bb5..2576aae840d 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -12,7 +12,6 @@ #include "db/blob/prefetch_buffer_collection.h" #include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" -#include "db/wide/wide_column_serialization.h" #include "db/wide/wide_columns_helper.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" @@ -111,9 +110,9 @@ Status MergeHelper::TimedFullMergeImpl( const MergeOperator* merge_operator, const Slice& key, MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, const std::vector& operands, Logger* logger, Statistics* statistics, - SystemClock* clock, bool update_num_ops_stats, std::string* result, - Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope) { + SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope, std::string* result, + Slice* result_operand, ValueType* result_type) { assert(result); assert(result_type); @@ -173,9 +172,9 @@ Status MergeHelper::TimedFullMergeImpl( const MergeOperator* merge_operator, const Slice& key, MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, const std::vector& operands, Logger* logger, Statistics* statistics, - SystemClock* clock, bool update_num_ops_stats, std::string* result_value, - PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope) { + SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope, std::string* result_value, + PinnableWideColumns* result_entity) { assert(result_value || result_entity); assert(!result_value || !result_entity); @@ -245,141 +244,6 @@ Status MergeHelper::TimedFullMergeImpl( op_failure_scope, std::move(visitor)); } -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, NoBaseValueTag, - const std::vector& operands, Logger* logger, Statistics* statistics, - SystemClock* clock, bool update_num_ops_stats, std::string* result, - Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value; - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result, result_operand, - result_type, op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, - const Slice& value, const std::vector& operands, Logger* logger, - Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result, Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value(value); - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result, result_operand, - result_type, op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, - const Slice& entity, const std::vector& operands, Logger* logger, - Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result, Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value; - - Slice entity_copy(entity); - WideColumns existing_columns; - - const Status s = - WideColumnSerialization::Deserialize(entity_copy, existing_columns); - if (!s.ok()) { - return s; - } - - existing_value = std::move(existing_columns); - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result, result_operand, - result_type, op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, - const WideColumns& columns, const std::vector& operands, - Logger* logger, Statistics* statistics, SystemClock* clock, - bool update_num_ops_stats, std::string* result, Slice* result_operand, - ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns); - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result, result_operand, - result_type, op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, NoBaseValueTag, - const std::vector& operands, Logger* logger, Statistics* statistics, - SystemClock* clock, bool update_num_ops_stats, std::string* result_value, - PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value; - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result_value, result_entity, - op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, - const Slice& value, const std::vector& operands, Logger* logger, - Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value(value); - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result_value, result_entity, - op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, - const Slice& entity, const std::vector& operands, Logger* logger, - Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value; - - Slice entity_copy(entity); - WideColumns existing_columns; - - const Status s = - WideColumnSerialization::Deserialize(entity_copy, existing_columns); - if (!s.ok()) { - return s; - } - - existing_value = std::move(existing_columns); - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result_value, result_entity, - op_failure_scope); -} - -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, - const WideColumns& columns, const std::vector& operands, - Logger* logger, Statistics* statistics, SystemClock* clock, - bool update_num_ops_stats, std::string* result_value, - PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope) { - MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns); - - return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), - operands, logger, statistics, clock, - update_num_ops_stats, result_value, result_entity, - op_failure_scope); -} - // PRE: iter points to the first merge type entry // POST: iter points to the first entry beyond the merge process (or the end) // keys_, operands_ are updated to reflect the merge result. @@ -428,7 +292,9 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors); assert(s.ok()); - if (!s.ok()) return s; + if (!s.ok()) { + return s; + } assert(kTypeMerge == orig_ikey.type); @@ -517,14 +383,26 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, s = TimedFullMerge(user_merge_operator_, ikey.user_key, kNoBaseValue, merge_context_.GetOperands(), logger_, stats_, clock_, /* update_num_ops_stats */ false, - &merge_result, /* result_operand */ nullptr, - &merge_result_type, &op_failure_scope); + &op_failure_scope, &merge_result, + /* result_operand */ nullptr, &merge_result_type); } else if (ikey.type == kTypeValue) { s = TimedFullMerge(user_merge_operator_, ikey.user_key, kPlainBaseValue, iter->value(), merge_context_.GetOperands(), logger_, stats_, clock_, /* update_num_ops_stats */ false, - &merge_result, /* result_operand */ nullptr, - &merge_result_type, &op_failure_scope); + &op_failure_scope, &merge_result, + /* result_operand */ nullptr, &merge_result_type); + } else if (ikey.type == kTypeValuePreferredSeqno) { + // When a TimedPut is merged with some merge operands, its original + // write time info is obsolete and removed, and the merge result is a + // kTypeValue. + Slice unpacked_value = ParsePackedValueForValue(iter->value()); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, kPlainBaseValue, + unpacked_value, merge_context_.GetOperands(), + logger_, stats_, clock_, + /* update_num_ops_stats */ false, &op_failure_scope, + &merge_result, + /* result_operand */ nullptr, &merge_result_type); + } else if (ikey.type == kTypeBlobIndex) { BlobIndex blob_index; @@ -557,20 +435,20 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, s = TimedFullMerge(user_merge_operator_, ikey.user_key, kPlainBaseValue, blob_value, merge_context_.GetOperands(), logger_, stats_, clock_, /* update_num_ops_stats */ false, - &merge_result, /* result_operand */ nullptr, - &merge_result_type, &op_failure_scope); + &op_failure_scope, &merge_result, + /* result_operand */ nullptr, &merge_result_type); } else if (ikey.type == kTypeWideColumnEntity) { s = TimedFullMerge(user_merge_operator_, ikey.user_key, kWideBaseValue, iter->value(), merge_context_.GetOperands(), logger_, stats_, clock_, /* update_num_ops_stats */ false, - &merge_result, /* result_operand */ nullptr, - &merge_result_type, &op_failure_scope); + &op_failure_scope, &merge_result, + /* result_operand */ nullptr, &merge_result_type); } else { s = TimedFullMerge(user_merge_operator_, ikey.user_key, kNoBaseValue, merge_context_.GetOperands(), logger_, stats_, clock_, /* update_num_ops_stats */ false, - &merge_result, /* result_operand */ nullptr, - &merge_result_type, &op_failure_scope); + &op_failure_scope, &merge_result, + /* result_operand */ nullptr, &merge_result_type); } // We store the result in keys_.back() and operands_.back() @@ -712,9 +590,9 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, MergeOperator::OpFailureScope op_failure_scope; s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, kNoBaseValue, merge_context_.GetOperands(), logger_, stats_, clock_, - /* update_num_ops_stats */ false, &merge_result, - /* result_operand */ nullptr, &merge_result_type, - &op_failure_scope); + /* update_num_ops_stats */ false, &op_failure_scope, + &merge_result, + /* result_operand */ nullptr, &merge_result_type); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of diff --git a/db/merge_helper.h b/db/merge_helper.h index 84c5f35351f..39bd15f6087 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -12,6 +12,7 @@ #include "db/merge_context.h" #include "db/range_del_aggregator.h" #include "db/snapshot_checker.h" +#include "db/wide/wide_column_serialization.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -60,74 +61,73 @@ class MergeHelper { struct WideBaseValueTag {}; static constexpr WideBaseValueTag kWideBaseValue{}; - // Variants that expose the merge result directly (in serialized form for wide - // columns) as well as its value type. Used by iterator and compaction. + template static Status TimedFullMerge(const MergeOperator* merge_operator, const Slice& key, NoBaseValueTag, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result, Slice* result_operand, - ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope); + MergeOperator::OpFailureScope* op_failure_scope, + ResultTs... results) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + + return TimedFullMergeImpl( + merge_operator, key, std::move(existing_value), operands, logger, + statistics, clock, update_num_ops_stats, op_failure_scope, results...); + } + template static Status TimedFullMerge( const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, const Slice& value, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result, Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope); + MergeOperator::OpFailureScope* op_failure_scope, ResultTs... results) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(value); + + return TimedFullMergeImpl( + merge_operator, key, std::move(existing_value), operands, logger, + statistics, clock, update_num_ops_stats, op_failure_scope, results...); + } + template static Status TimedFullMerge( const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, const Slice& entity, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result, Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope); + MergeOperator::OpFailureScope* op_failure_scope, ResultTs... results) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; - static Status TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, - const WideColumns& columns, const std::vector& operands, - Logger* logger, Statistics* statistics, SystemClock* clock, - bool update_num_ops_stats, std::string* result, Slice* result_operand, - ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope); + Slice entity_copy(entity); + WideColumns existing_columns; - // Variants that expose the merge result translated to the form requested by - // the client. (For example, if the result is a wide-column structure but the - // client requested the results in plain-value form, the value of the default - // column is returned.) Used by point lookups. - static Status TimedFullMerge(const MergeOperator* merge_operator, - const Slice& key, NoBaseValueTag, - const std::vector& operands, - Logger* logger, Statistics* statistics, - SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, - PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope); + const Status s = + WideColumnSerialization::Deserialize(entity_copy, existing_columns); + if (!s.ok()) { + return s; + } - static Status TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, - const Slice& value, const std::vector& operands, Logger* logger, - Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope); + existing_value = std::move(existing_columns); - static Status TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, - const Slice& entity, const std::vector& operands, Logger* logger, - Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope); + return TimedFullMergeImpl( + merge_operator, key, std::move(existing_value), operands, logger, + statistics, clock, update_num_ops_stats, op_failure_scope, results...); + } + template static Status TimedFullMerge(const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, const WideColumns& columns, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, - PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope); + MergeOperator::OpFailureScope* op_failure_scope, + ResultTs... results) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns); + + return TimedFullMergeImpl( + merge_operator, key, std::move(existing_value), operands, logger, + statistics, clock, update_num_ops_stats, op_failure_scope, results...); + } // During compaction, merge entries until we hit // - a corrupted key @@ -271,21 +271,27 @@ class MergeHelper { Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, MergeOperator::OpFailureScope* op_failure_scope, Visitor&& visitor); + // Variant that exposes the merge result directly (in serialized form for wide + // columns) as well as its value type. Used by iterator and compaction. static Status TimedFullMergeImpl( const MergeOperator* merge_operator, const Slice& key, MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result, Slice* result_operand, ValueType* result_type, - MergeOperator::OpFailureScope* op_failure_scope); + MergeOperator::OpFailureScope* op_failure_scope, std::string* result, + Slice* result_operand, ValueType* result_type); + // Variant that exposes the merge result translated into the form requested by + // the client. (For example, if the result is a wide-column structure but the + // client requested the results in plain-value form, the value of the default + // column is returned.) Used by point lookups. static Status TimedFullMergeImpl( const MergeOperator* merge_operator, const Slice& key, MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, - std::string* result_value, PinnableWideColumns* result_entity, - MergeOperator::OpFailureScope* op_failure_scope); + MergeOperator::OpFailureScope* op_failure_scope, + std::string* result_value, PinnableWideColumns* result_entity); }; // MergeOutputIterator can be used to iterate over the result of a merge. diff --git a/db/merge_test.cc b/db/merge_test.cc index 93a8535a7ee..da55fc94f32 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include - +#include #include #include @@ -87,7 +86,9 @@ class EnvMergeTest : public EnvWrapper { static std::unique_ptr singleton_; static EnvMergeTest* GetInstance() { - if (nullptr == singleton_) singleton_.reset(new EnvMergeTest); + if (nullptr == singleton_) { + singleton_.reset(new EnvMergeTest); + } return singleton_.get(); } }; @@ -145,7 +146,7 @@ class Counters { assert(db_); } - virtual ~Counters() {} + virtual ~Counters() = default; // public interface of Counters. // All four functions return false @@ -194,7 +195,7 @@ class Counters { std::cerr << "value corruption\n"; return false; } - *value = DecodeFixed64(&str[0]); + *value = DecodeFixed64(str.data()); return true; } else { std::cerr << s.ToString() << std::endl; @@ -220,14 +221,18 @@ class Counters { uint64_t value = default_; int result = get(key, &value); assert(result); - if (result == 0) exit(1); // Disable unused variable warning. + if (result == 0) { + exit(1); // Disable unused variable warning. + } return value; } void assert_add(const std::string& key, uint64_t value) { int result = add(key, value); assert(result); - if (result == 0) exit(1); // Disable unused variable warning. + if (result == 0) { + exit(1); // Disable unused variable warning. + } } }; @@ -349,7 +354,7 @@ void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { }); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) { - auto* mutex = reinterpret_cast(arg); + auto* mutex = static_cast(arg); mutex->AssertHeld(); int thread_id = get_thread_id(); ASSERT_EQ(2, thread_id); @@ -375,12 +380,12 @@ void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { SyncPoint::GetInstance()->EnableProcessing(); port::Thread set_options_thread([&]() { - ASSERT_OK(reinterpret_cast(db)->SetOptions( + ASSERT_OK(static_cast(db)->SetOptions( {{"disable_auto_compactions", "false"}})); }); TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); port::Thread compact_thread([&]() { - ASSERT_OK(reinterpret_cast(db)->CompactRange( + ASSERT_OK(static_cast(db)->CompactRange( CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); }); @@ -496,7 +501,7 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, std::string get_value_str; ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str)); assert(get_value_str.size() == sizeof(uint64_t)); - uint64_t get_value = DecodeFixed64(&get_value_str[0]); + uint64_t get_value = DecodeFixed64(get_value_str.data()); ASSERT_EQ(get_value, num_merges * merge_value); ASSERT_EQ(num_merge_operator_calls, static_cast((num_merges % (max_num_merges + 1)))); diff --git a/db/multi_cf_iterator.cc b/db/multi_cf_iterator.cc new file mode 100644 index 00000000000..80e4171d54d --- /dev/null +++ b/db/multi_cf_iterator.cc @@ -0,0 +1,102 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/multi_cf_iterator.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +template +void MultiCfIterator::SeekCommon(BinaryHeap& heap, + ChildSeekFuncType child_seek_func) { + heap.clear(); + int i = 0; + for (auto& cfh_iter_pair : cfh_iter_pairs_) { + auto& cfh = cfh_iter_pair.first; + auto& iter = cfh_iter_pair.second; + child_seek_func(iter.get()); + if (iter->Valid()) { + assert(iter->status().ok()); + heap.push(MultiCfIteratorInfo{iter.get(), cfh, i}); + } else { + considerStatus(iter->status()); + } + ++i; + } +} + +template +void MultiCfIterator::AdvanceIterator(BinaryHeap& heap, + AdvanceFuncType advance_func) { + // 1. Keep the top iterator (by popping it from the heap) + // 2. Make sure all others have iterated past the top iterator key slice + // 3. Advance the top iterator, and add it back to the heap if valid + auto top = heap.top(); + heap.pop(); + if (!heap.empty()) { + auto* current = heap.top().iterator; + while (current->Valid() && + comparator_->Compare(top.iterator->key(), current->key()) == 0) { + assert(current->status().ok()); + advance_func(current); + if (current->Valid()) { + heap.replace_top(heap.top()); + } else { + considerStatus(current->status()); + heap.pop(); + } + if (!heap.empty()) { + current = heap.top().iterator; + } + } + } + advance_func(top.iterator); + if (top.iterator->Valid()) { + assert(top.iterator->status().ok()); + heap.push(top); + } else { + considerStatus(top.iterator->status()); + } +} + +void MultiCfIterator::SeekToFirst() { + auto& min_heap = GetHeap([this]() { InitMinHeap(); }); + SeekCommon(min_heap, [](Iterator* iter) { iter->SeekToFirst(); }); +} +void MultiCfIterator::Seek(const Slice& target) { + auto& min_heap = GetHeap([this]() { InitMinHeap(); }); + SeekCommon(min_heap, [&target](Iterator* iter) { iter->Seek(target); }); +} +void MultiCfIterator::SeekToLast() { + auto& max_heap = GetHeap([this]() { InitMaxHeap(); }); + SeekCommon(max_heap, [](Iterator* iter) { iter->SeekToLast(); }); +} +void MultiCfIterator::SeekForPrev(const Slice& target) { + auto& max_heap = GetHeap([this]() { InitMaxHeap(); }); + SeekCommon(max_heap, + [&target](Iterator* iter) { iter->SeekForPrev(target); }); +} + +void MultiCfIterator::Next() { + assert(Valid()); + auto& min_heap = GetHeap([this]() { + Slice target = key(); + InitMinHeap(); + Seek(target); + }); + AdvanceIterator(min_heap, [](Iterator* iter) { iter->Next(); }); +} +void MultiCfIterator::Prev() { + assert(Valid()); + auto& max_heap = GetHeap([this]() { + Slice target = key(); + InitMaxHeap(); + SeekForPrev(target); + }); + AdvanceIterator(max_heap, [](Iterator* iter) { iter->Prev(); }); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/multi_cf_iterator.h b/db/multi_cf_iterator.h new file mode 100644 index 00000000000..cdd09c16df0 --- /dev/null +++ b/db/multi_cf_iterator.h @@ -0,0 +1,159 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "util/heap.h" +#include "util/overload.h" + +namespace ROCKSDB_NAMESPACE { + +// UNDER CONSTRUCTION - DO NOT USE +// A cross-column-family iterator from a consistent database state. +// When the same key exists in more than one column families, the iterator +// selects the value from the first column family containing the key, in the +// order provided in the `column_families` parameter. +class MultiCfIterator : public Iterator { + public: + MultiCfIterator(const Comparator* comparator, + const std::vector& column_families, + const std::vector& child_iterators) + : comparator_(comparator), + heap_(MultiCfMinHeap( + MultiCfHeapItemComparator>(comparator_))) { + assert(column_families.size() > 0 && + column_families.size() == child_iterators.size()); + cfh_iter_pairs_.reserve(column_families.size()); + for (size_t i = 0; i < column_families.size(); ++i) { + cfh_iter_pairs_.emplace_back( + column_families[i], std::unique_ptr(child_iterators[i])); + } + } + ~MultiCfIterator() override { status_.PermitUncheckedError(); } + + // No copy allowed + MultiCfIterator(const MultiCfIterator&) = delete; + MultiCfIterator& operator=(const MultiCfIterator&) = delete; + + private: + std::vector>> + cfh_iter_pairs_; + ReadOptions read_options_; + Status status_; + + AttributeGroups attribute_groups_; + + struct MultiCfIteratorInfo { + Iterator* iterator; + ColumnFamilyHandle* cfh; + int order; + }; + + template + class MultiCfHeapItemComparator { + public: + explicit MultiCfHeapItemComparator(const Comparator* comparator) + : comparator_(comparator) {} + bool operator()(const MultiCfIteratorInfo& a, + const MultiCfIteratorInfo& b) const { + assert(a.iterator); + assert(b.iterator); + assert(a.iterator->Valid()); + assert(b.iterator->Valid()); + int c = comparator_->Compare(a.iterator->key(), b.iterator->key()); + assert(c != 0 || a.order != b.order); + return c == 0 ? a.order - b.order > 0 : CompareOp()(c, 0); + } + + private: + const Comparator* comparator_; + }; + const Comparator* comparator_; + using MultiCfMinHeap = + BinaryHeap>>; + using MultiCfMaxHeap = BinaryHeap>>; + + using MultiCfIterHeap = std::variant; + + MultiCfIterHeap heap_; + + // TODO: Lower and Upper bounds + + Iterator* current() const { + if (std::holds_alternative(heap_)) { + auto& max_heap = std::get(heap_); + return max_heap.top().iterator; + } + auto& min_heap = std::get(heap_); + return min_heap.top().iterator; + } + + Slice key() const override { + assert(Valid()); + return current()->key(); + } + Slice value() const override { + assert(Valid()); + return current()->value(); + } + const WideColumns& columns() const override { + assert(Valid()); + return current()->columns(); + } + + bool Valid() const override { + if (std::holds_alternative(heap_)) { + auto& max_heap = std::get(heap_); + return !max_heap.empty() && status_.ok(); + } + auto& min_heap = std::get(heap_); + return !min_heap.empty() && status_.ok(); + } + + Status status() const override { return status_; } + void considerStatus(Status s) { + if (!s.ok() && status_.ok()) { + status_ = std::move(s); + } + } + + template + HeapType& GetHeap(InitFunc initFunc) { + if (!std::holds_alternative(heap_)) { + initFunc(); + } + return std::get(heap_); + } + + void InitMinHeap() { + heap_.emplace( + MultiCfHeapItemComparator>(comparator_)); + } + void InitMaxHeap() { + heap_.emplace( + MultiCfHeapItemComparator>(comparator_)); + } + + template + void SeekCommon(BinaryHeap& heap, ChildSeekFuncType child_seek_func); + template + void AdvanceIterator(BinaryHeap& heap, AdvanceFuncType advance_func); + + void SeekToFirst() override; + void SeekToLast() override; + void Seek(const Slice& /*target*/) override; + void SeekForPrev(const Slice& /*target*/) override; + void Next() override; + void Prev() override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/multi_cf_iterator_test.cc b/db/multi_cf_iterator_test.cc new file mode 100644 index 00000000000..f4d146ca14a --- /dev/null +++ b/db/multi_cf_iterator_test.cc @@ -0,0 +1,497 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "db/db_test_util.h" + +namespace ROCKSDB_NAMESPACE { + +class MultiCfIteratorTest : public DBTestBase { + public: + MultiCfIteratorTest() + : DBTestBase("multi_cf_iterator_test", /*env_do_fsync=*/true) {} + + // Verify Iteration of MultiCfIterator + // by SeekToFirst() + Next() and SeekToLast() + Prev() + void verifyMultiCfIterator( + const std::vector& cfhs, + const std::vector& expected_keys, + const std::optional>& expected_values = std::nullopt, + const std::optional>& expected_wide_columns = + std::nullopt, + const std::optional>& + expected_attribute_groups = std::nullopt) { + int i = 0; + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), cfhs); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(expected_keys[i], iter->key()); + if (expected_values.has_value()) { + ASSERT_EQ(expected_values.value()[i], iter->value()); + } + if (expected_wide_columns.has_value()) { + ASSERT_EQ(expected_wide_columns.value()[i], iter->columns()); + } + if (expected_attribute_groups.has_value()) { + // TODO - Add this back when attribute_groups() API is added + // ASSERT_EQ(expected_attribute_groups.value()[i], + // iter->attribute_groups()); + } + ++i; + } + ASSERT_EQ(expected_keys.size(), i); + ASSERT_OK(iter->status()); + + int rev_i = i - 1; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_EQ(expected_keys[rev_i], iter->key()); + if (expected_values.has_value()) { + ASSERT_EQ(expected_values.value()[rev_i], iter->value()); + } + if (expected_wide_columns.has_value()) { + ASSERT_EQ(expected_wide_columns.value()[rev_i], iter->columns()); + } + if (expected_attribute_groups.has_value()) { + // TODO - Add this back when attribute_groups() API is added + // ASSERT_EQ(expected_attribute_groups.value()[rev_i], + // iter->attribute_groups()); + } + rev_i--; + } + ASSERT_OK(iter->status()); + } + + void verifyExpectedKeys(ColumnFamilyHandle* cfh, + const std::vector& expected_keys) { + int i = 0; + Iterator* iter = db_->NewIterator(ReadOptions(), cfh); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(expected_keys[i], iter->key()); + ++i; + } + ASSERT_EQ(expected_keys.size(), i); + ASSERT_OK(iter->status()); + delete iter; + } +}; + +TEST_F(MultiCfIteratorTest, InvalidArguments) { + Options options = GetDefaultOptions(); + { + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + // Invalid - No CF is provided + std::unique_ptr iter_with_no_cf = + db_->NewMultiCfIterator(ReadOptions(), {}); + ASSERT_NOK(iter_with_no_cf->status()); + ASSERT_TRUE(iter_with_no_cf->status().IsInvalidArgument()); + } +} + +TEST_F(MultiCfIteratorTest, SimpleValues) { + Options options = GetDefaultOptions(); + { + // Case 1: Unique key per CF + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + ASSERT_OK(Put(0, "key_1", "key_1_cf_0_val")); + ASSERT_OK(Put(1, "key_2", "key_2_cf_1_val")); + ASSERT_OK(Put(2, "key_3", "key_3_cf_2_val")); + ASSERT_OK(Put(3, "key_4", "key_4_cf_3_val")); + + std::vector expected_keys = {"key_1", "key_2", "key_3", "key_4"}; + std::vector expected_values = {"key_1_cf_0_val", "key_2_cf_1_val", + "key_3_cf_2_val", "key_4_cf_3_val"}; + + // Test for iteration over CF default->1->2->3 + std::vector cfhs_order_0_1_2_3 = { + handles_[0], handles_[1], handles_[2], handles_[3]}; + verifyMultiCfIterator(cfhs_order_0_1_2_3, expected_keys, expected_values); + + // Test for iteration over CF 3->1->default_cf->2 + std::vector cfhs_order_3_1_0_2 = { + handles_[3], handles_[1], handles_[0], handles_[2]}; + // Iteration order and the return values should be the same since keys are + // unique per CF + verifyMultiCfIterator(cfhs_order_3_1_0_2, expected_keys, expected_values); + + // Verify Seek() + { + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), cfhs_order_0_1_2_3); + iter->Seek(""); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + iter->Seek("key_1"); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + iter->Seek("key_2"); + ASSERT_EQ(IterStatus(iter.get()), "key_2->key_2_cf_1_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_3->key_3_cf_2_val"); + iter->Seek("key_x"); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } + // Verify SeekForPrev() + { + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), cfhs_order_0_1_2_3); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->SeekForPrev("key_1"); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_2->key_2_cf_1_val"); + iter->SeekForPrev("key_x"); + ASSERT_EQ(IterStatus(iter.get()), "key_4->key_4_cf_3_val"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter.get()), "key_3->key_3_cf_2_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_4->key_4_cf_3_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } + } + { + // Case 2: Same key in multiple CFs + options = CurrentOptions(options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + ASSERT_OK(Put(0, "key_1", "key_1_cf_0_val")); + ASSERT_OK(Put(3, "key_1", "key_1_cf_3_val")); + ASSERT_OK(Put(1, "key_2", "key_2_cf_1_val")); + ASSERT_OK(Put(2, "key_2", "key_2_cf_2_val")); + ASSERT_OK(Put(0, "key_3", "key_3_cf_0_val")); + ASSERT_OK(Put(1, "key_3", "key_3_cf_1_val")); + ASSERT_OK(Put(3, "key_3", "key_3_cf_3_val")); + + std::vector expected_keys = {"key_1", "key_2", "key_3"}; + + // Test for iteration over CFs default->1->2->3 + std::vector cfhs_order_0_1_2_3 = { + handles_[0], handles_[1], handles_[2], handles_[3]}; + std::vector expected_values = {"key_1_cf_0_val", "key_2_cf_1_val", + "key_3_cf_0_val"}; + verifyMultiCfIterator(cfhs_order_0_1_2_3, expected_keys, expected_values); + + // Test for iteration over CFs 3->2->default_cf->1 + std::vector cfhs_order_3_2_0_1 = { + handles_[3], handles_[2], handles_[0], handles_[1]}; + expected_values = {"key_1_cf_3_val", "key_2_cf_2_val", "key_3_cf_3_val"}; + verifyMultiCfIterator(cfhs_order_3_2_0_1, expected_keys, expected_values); + + // Verify Seek() + { + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), cfhs_order_3_2_0_1); + iter->Seek(""); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_3_val"); + iter->Seek("key_1"); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_3_val"); + iter->Seek("key_2"); + ASSERT_EQ(IterStatus(iter.get()), "key_2->key_2_cf_2_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_3->key_3_cf_3_val"); + iter->Seek("key_x"); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } + // Verify SeekForPrev() + { + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), cfhs_order_3_2_0_1); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->SeekForPrev("key_1"); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_3_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_2->key_2_cf_2_val"); + iter->SeekForPrev("key_x"); + ASSERT_EQ(IterStatus(iter.get()), "key_3->key_3_cf_3_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } + } +} + +TEST_F(MultiCfIteratorTest, EmptyCfs) { + Options options = GetDefaultOptions(); + { + // Case 1: No keys in any of the CFs + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), handles_); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->SeekForPrev("foo"); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + + ASSERT_OK(iter->status()); + } + { + // Case 2: A single key exists in only one of the CF. Rest CFs are empty. + ASSERT_OK(Put(1, "key_1", "key_1_cf_1_val")); + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), handles_); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_1_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_1_val"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } + { + // Case 3: same key exists in all of the CFs except one (cf_2) + ASSERT_OK(Put(0, "key_1", "key_1_cf_0_val")); + ASSERT_OK(Put(3, "key_1", "key_1_cf_3_val")); + // handles_ are in the order of 0->1->2->3. We should expect value from cf_0 + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), handles_); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } +} + +TEST_F(MultiCfIteratorTest, WideColumns) { + // Set up the DB and Column Families + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + constexpr char key_1[] = "key_1"; + WideColumns key_1_columns_in_cf_2{ + {kDefaultWideColumnName, "cf_2_col_val_0_key_1"}, + {"cf_2_col_name_1", "cf_2_col_val_1_key_1"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_1"}}; + WideColumns key_1_columns_in_cf_3{ + {"cf_3_col_name_1", "cf_3_col_val_1_key_1"}, + {"cf_3_col_name_2", "cf_3_col_val_2_key_1"}, + {"cf_3_col_name_3", "cf_3_col_val_3_key_1"}}; + + constexpr char key_2[] = "key_2"; + WideColumns key_2_columns_in_cf_1{ + {"cf_1_col_name_1", "cf_1_col_val_1_key_2"}}; + WideColumns key_2_columns_in_cf_2{ + {"cf_2_col_name_1", "cf_2_col_val_1_key_2"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_2"}}; + + constexpr char key_3[] = "key_3"; + WideColumns key_3_columns_in_cf_1{ + {"cf_1_col_name_1", "cf_1_col_val_1_key_3"}}; + WideColumns key_3_columns_in_cf_3{ + {"cf_3_col_name_1", "cf_3_col_val_1_key_3"}}; + + constexpr char key_4[] = "key_4"; + WideColumns key_4_columns_in_cf_0{ + {"cf_0_col_name_1", "cf_0_col_val_1_key_4"}}; + WideColumns key_4_columns_in_cf_2{ + {"cf_2_col_name_1", "cf_2_col_val_1_key_4"}}; + + // Use AttributeGroup PutEntity API to insert them together + AttributeGroups key_1_attribute_groups{ + AttributeGroup(handles_[2], key_1_columns_in_cf_2), + AttributeGroup(handles_[3], key_1_columns_in_cf_3)}; + AttributeGroups key_2_attribute_groups{ + AttributeGroup(handles_[1], key_2_columns_in_cf_1), + AttributeGroup(handles_[2], key_2_columns_in_cf_2)}; + AttributeGroups key_3_attribute_groups{ + AttributeGroup(handles_[1], key_3_columns_in_cf_1), + AttributeGroup(handles_[3], key_3_columns_in_cf_3)}; + AttributeGroups key_4_attribute_groups{ + AttributeGroup(handles_[0], key_4_columns_in_cf_0), + AttributeGroup(handles_[2], key_4_columns_in_cf_2)}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), key_1, key_1_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_2, key_2_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_3, key_3_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_4, key_4_attribute_groups)); + + // Test for iteration over CF default->1->2->3 + std::vector cfhs_order_0_1_2_3 = { + handles_[0], handles_[1], handles_[2], handles_[3]}; + std::vector expected_keys = {key_1, key_2, key_3, key_4}; + // Pick what DBIter would return for value() in the first CF that key exists + // Since value for kDefaultWideColumnName only exists for key_1, rest will + // return empty value + std::vector expected_values = {"cf_2_col_val_0_key_1", "", "", ""}; + + // Pick columns from the first CF that the key exists and value is stored as + // wide column + std::vector expected_wide_columns = { + {{kDefaultWideColumnName, "cf_2_col_val_0_key_1"}, + {"cf_2_col_name_1", "cf_2_col_val_1_key_1"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_1"}}, + {{"cf_1_col_name_1", "cf_1_col_val_1_key_2"}}, + {{"cf_1_col_name_1", "cf_1_col_val_1_key_3"}}, + {{"cf_0_col_name_1", "cf_0_col_val_1_key_4"}}}; + verifyMultiCfIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, + expected_wide_columns); +} + +TEST_F(MultiCfIteratorTest, DifferentComparatorsInMultiCFs) { + // This test creates two column families with two different comparators. + // Attempting to create the MultiCFIterator should fail. + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + options.comparator = BytewiseComparator(); + CreateColumnFamilies({"cf_forward"}, options); + options.comparator = ReverseBytewiseComparator(); + CreateColumnFamilies({"cf_reverse"}, options); + + ASSERT_OK(Put(0, "key_1", "value_1")); + ASSERT_OK(Put(0, "key_2", "value_2")); + ASSERT_OK(Put(0, "key_3", "value_3")); + ASSERT_OK(Put(1, "key_1", "value_1")); + ASSERT_OK(Put(1, "key_2", "value_2")); + ASSERT_OK(Put(1, "key_3", "value_3")); + + verifyExpectedKeys(handles_[0], {"key_1", "key_2", "key_3"}); + verifyExpectedKeys(handles_[1], {"key_3", "key_2", "key_1"}); + + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), handles_); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsInvalidArgument()); +} + +TEST_F(MultiCfIteratorTest, CustomComparatorsInMultiCFs) { + // This test creates two column families with the same custom test + // comparators (but instantiated independently). Attempting to create the + // MultiCFIterator should not fail. + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + static auto comparator_1 = + std::make_unique( + test::SimpleSuffixReverseComparator()); + static auto comparator_2 = + std::make_unique( + test::SimpleSuffixReverseComparator()); + ASSERT_NE(comparator_1, comparator_2); + + options.comparator = comparator_1.get(); + CreateColumnFamilies({"cf_1"}, options); + options.comparator = comparator_2.get(); + CreateColumnFamilies({"cf_2"}, options); + + ASSERT_OK(Put(0, "key_001_001", "value_0_3")); + ASSERT_OK(Put(0, "key_001_002", "value_0_2")); + ASSERT_OK(Put(0, "key_001_003", "value_0_1")); + ASSERT_OK(Put(0, "key_002_001", "value_0_6")); + ASSERT_OK(Put(0, "key_002_002", "value_0_5")); + ASSERT_OK(Put(0, "key_002_003", "value_0_4")); + ASSERT_OK(Put(1, "key_001_001", "value_1_3")); + ASSERT_OK(Put(1, "key_001_002", "value_1_2")); + ASSERT_OK(Put(1, "key_001_003", "value_1_1")); + ASSERT_OK(Put(1, "key_003_004", "value_1_6")); + ASSERT_OK(Put(1, "key_003_005", "value_1_5")); + ASSERT_OK(Put(1, "key_003_006", "value_1_4")); + + verifyExpectedKeys( + handles_[0], {"key_001_003", "key_001_002", "key_001_001", "key_002_003", + "key_002_002", "key_002_001"}); + verifyExpectedKeys( + handles_[1], {"key_001_003", "key_001_002", "key_001_001", "key_003_006", + "key_003_005", "key_003_004"}); + + std::vector expected_keys = { + "key_001_003", "key_001_002", "key_001_001", "key_002_003", "key_002_002", + "key_002_001", "key_003_006", "key_003_005", "key_003_004"}; + std::vector expected_values = {"value_0_1", "value_0_2", "value_0_3", + "value_0_4", "value_0_5", "value_0_6", + "value_1_4", "value_1_5", "value_1_6"}; + int i = 0; + std::unique_ptr iter = + db_->NewMultiCfIterator(ReadOptions(), handles_); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(expected_keys[i], iter->key()); + ASSERT_EQ(expected_values[i], iter->value()); + ++i; + } + ASSERT_OK(iter->status()); +} + +TEST_F(MultiCfIteratorTest, DISABLED_IterateAttributeGroups) { + // Set up the DB and Column Families + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + constexpr char key_1[] = "key_1"; + WideColumns key_1_columns_in_cf_2{ + {kDefaultWideColumnName, "cf_2_col_val_0_key_1"}, + {"cf_2_col_name_1", "cf_2_col_val_1_key_1"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_1"}}; + WideColumns key_1_columns_in_cf_3{ + {"cf_3_col_name_1", "cf_3_col_val_1_key_1"}, + {"cf_3_col_name_2", "cf_3_col_val_2_key_1"}, + {"cf_3_col_name_3", "cf_3_col_val_3_key_1"}}; + + constexpr char key_2[] = "key_2"; + WideColumns key_2_columns_in_cf_1{ + {"cf_1_col_name_1", "cf_1_col_val_1_key_2"}}; + WideColumns key_2_columns_in_cf_2{ + {"cf_2_col_name_1", "cf_2_col_val_1_key_2"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_2"}}; + + constexpr char key_3[] = "key_3"; + WideColumns key_3_columns_in_cf_1{ + {"cf_1_col_name_1", "cf_1_col_val_1_key_3"}}; + WideColumns key_3_columns_in_cf_3{ + {"cf_3_col_name_1", "cf_3_col_val_1_key_3"}}; + + constexpr char key_4[] = "key_4"; + WideColumns key_4_columns_in_cf_0{ + {"cf_0_col_name_1", "cf_0_col_val_1_key_4"}}; + WideColumns key_4_columns_in_cf_2{ + {"cf_2_col_name_1", "cf_2_col_val_1_key_4"}}; + + AttributeGroups key_1_attribute_groups{ + AttributeGroup(handles_[2], key_1_columns_in_cf_2), + AttributeGroup(handles_[3], key_1_columns_in_cf_3)}; + AttributeGroups key_2_attribute_groups{ + AttributeGroup(handles_[1], key_2_columns_in_cf_1), + AttributeGroup(handles_[2], key_2_columns_in_cf_2)}; + AttributeGroups key_3_attribute_groups{ + AttributeGroup(handles_[1], key_3_columns_in_cf_1), + AttributeGroup(handles_[3], key_3_columns_in_cf_3)}; + AttributeGroups key_4_attribute_groups{ + AttributeGroup(handles_[0], key_4_columns_in_cf_0), + AttributeGroup(handles_[2], key_4_columns_in_cf_2)}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), key_1, key_1_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_2, key_2_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_3, key_3_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_4, key_4_attribute_groups)); + + // Test for iteration over CF default->1->2->3 + std::vector cfhs_order_0_1_2_3 = { + handles_[0], handles_[1], handles_[2], handles_[3]}; + std::vector expected_keys = {key_1, key_2, key_3, key_4}; + std::vector expected_attribute_groups = { + key_1_attribute_groups, key_2_attribute_groups, key_3_attribute_groups, + key_4_attribute_groups}; + verifyMultiCfIterator( + cfhs_order_0_1_2_3, expected_keys, std::nullopt /* expected_values */, + std::nullopt /* expected_wide_columns */, expected_attribute_groups); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index eec1486c1ba..37b9d110e2a 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -7,10 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include - #include +#include #include #include #include @@ -68,7 +66,7 @@ class ObsoleteFilesTest : public DBTestBase { int log_cnt = 0; int sst_cnt = 0; int manifest_cnt = 0; - for (auto file : filenames) { + for (const auto& file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { @@ -120,7 +118,7 @@ TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { }); SyncPoint::GetInstance()->SetCallBack( "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) { - Status* p_status = reinterpret_cast(arg); + Status* p_status = static_cast(arg); ASSERT_OK(*p_status); }); SyncPoint::GetInstance()->SetCallBack( @@ -165,7 +163,7 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { {{"paranoid_file_checks", "true"}})); } } - ASSERT_OK(dbfull()->EnableFileDeletions(/*force=*/false)); + ASSERT_OK(dbfull()->EnableFileDeletions()); Close(); diff --git a/db/options_file_test.cc b/db/options_file_test.cc index c3adbeb642f..7e48f0cf38c 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -28,7 +28,7 @@ void UpdateOptionsFiles(DB* db, uint64_t number; FileType type; *options_files_count = 0; - for (auto filename : filenames) { + for (const auto& filename : filenames) { if (ParseFileName(filename, &number, &type) && type == kOptionsFile) { filename_history->insert(filename); (*options_files_count)++; @@ -44,16 +44,16 @@ void VerifyOptionsFileName( EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames)); uint64_t number; FileType type; - for (auto filename : filenames) { + for (const auto& filename : filenames) { if (ParseFileName(filename, &number, &type) && type == kOptionsFile) { current_filenames.insert(filename); } } - for (auto past_filename : past_filenames) { + for (const auto& past_filename : past_filenames) { if (current_filenames.find(past_filename) != current_filenames.end()) { continue; } - for (auto filename : current_filenames) { + for (const auto& filename : current_filenames) { ASSERT_GT(filename, past_filename); } } diff --git a/db/output_validator.cc b/db/output_validator.cc index e93e2d68c45..0c7109f3c2b 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -15,19 +15,15 @@ Status OutputValidator::Add(const Slice& key, const Slice& value) { paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_); paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_); } - if (enable_order_check_) { - TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check", - /*arg=*/nullptr); - if (key.size() < kNumInternalBytes) { - return Status::Corruption( - "Compaction tries to write a key without internal bytes."); - } - // prev_key_ starts with empty. - if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) { - return Status::Corruption("Compaction sees out-of-order keys."); - } - prev_key_.assign(key.data(), key.size()); + if (key.size() < kNumInternalBytes) { + return Status::Corruption( + "Compaction tries to write a key without internal bytes."); } + // prev_key_ starts with empty. + if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) { + return Status::Corruption("Compaction sees out-of-order keys."); + } + prev_key_.assign(key.data(), key.size()); return Status::OK(); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/output_validator.h b/db/output_validator.h index 40635f9c44c..1e7a8988edb 100644 --- a/db/output_validator.h +++ b/db/output_validator.h @@ -15,12 +15,10 @@ namespace ROCKSDB_NAMESPACE { // of all the key and value. class OutputValidator { public: - explicit OutputValidator(const InternalKeyComparator& icmp, - bool enable_order_check, bool enable_hash, + explicit OutputValidator(const InternalKeyComparator& icmp, bool enable_hash, uint64_t precalculated_hash = 0) : icmp_(icmp), paranoid_hash_(precalculated_hash), - enable_order_check_(enable_order_check), enable_hash_(enable_hash) {} // Add a key to the KV sequence, and return whether the key follows @@ -42,7 +40,6 @@ class OutputValidator { const InternalKeyComparator& icmp_; std::string prev_key_; uint64_t paranoid_hash_ = 0; - bool enable_order_check_; bool enable_hash_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 666ed32f0ec..c439c1ffedf 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -943,15 +943,17 @@ TEST_F(PerfContextTest, CPUTimer) { // monotonically increasing get_perf_context()->Reset(); - auto count = get_perf_context()->iter_seek_cpu_nanos; + uint64_t count = get_perf_context()->iter_seek_cpu_nanos; + uint64_t before_count = count; for (int i = 0; i < FLAGS_total_keys; ++i) { iter->Seek("k" + std::to_string(i)); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v" + std::to_string(i), iter->value().ToString()); auto next_count = get_perf_context()->iter_seek_cpu_nanos; - ASSERT_GT(next_count, count); + ASSERT_GE(next_count, count); count = next_count; } + ASSERT_GT(count, before_count); // iterator creation/destruction; multiple iterators { @@ -1050,7 +1052,7 @@ TEST_F(PerfContextTest, MergeOperandCount) { std::vector statuses(num_keys); db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), num_keys, - &key_slices[0], &results[0], &statuses[0]); + key_slices.data(), results.data(), statuses.data()); for (size_t i = 0; i < num_keys; ++i) { ASSERT_OK(statuses[i]); @@ -1068,7 +1070,7 @@ TEST_F(PerfContextTest, MergeOperandCount) { std::vector statuses(num_keys); db->MultiGetEntity(ReadOptions(), db->DefaultColumnFamily(), num_keys, - &key_slices[0], &results[0], &statuses[0]); + key_slices.data(), results.data(), statuses.data()); for (size_t i = 0; i < num_keys; ++i) { ASSERT_OK(statuses[i]); @@ -1119,6 +1121,23 @@ TEST_F(PerfContextTest, MergeOperandCount) { verify(); } +TEST_F(PerfContextTest, WriteMemtableTimePerfLevel) { + // Write and check time + ASSERT_OK(DestroyDB(kDbName, Options())); + std::shared_ptr db = OpenDb(); + + SetPerfLevel(PerfLevel::kEnableWait); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->Reset(); + ASSERT_OK(db->Put(WriteOptions(), "foo1", "bar")); + ASSERT_GT(perf_context.write_memtable_time, 0); + + SetPerfLevel(PerfLevel::kEnableCount); + perf_ctx->Reset(); + ASSERT_OK(db->Put(WriteOptions(), "foo0", "bar")); + ASSERT_EQ(perf_context.write_memtable_time, 0); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/periodic_task_scheduler.cc b/db/periodic_task_scheduler.cc index 1c4fc16b1c8..2f266529c57 100644 --- a/db/periodic_task_scheduler.cc +++ b/db/periodic_task_scheduler.cc @@ -76,7 +76,7 @@ Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type, task_type, TaskInfo{unique_id, repeat_period_seconds}); if (!result.second) { return Status::Aborted("Failed to add periodic task"); - }; + } return Status::OK(); } diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc index c1205bcf612..baf74ed15e3 100644 --- a/db/periodic_task_scheduler_test.cc +++ b/db/periodic_task_scheduler_test.cc @@ -29,7 +29,7 @@ class PeriodicTaskSchedulerTest : public DBTestBase { SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { auto periodic_task_scheduler_ptr = - reinterpret_cast(arg); + static_cast(arg); periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); }); } diff --git a/db/pinned_iterators_manager.h b/db/pinned_iterators_manager.h index 0fcf231dad4..5caa1eacb2f 100644 --- a/db/pinned_iterators_manager.h +++ b/db/pinned_iterators_manager.h @@ -78,11 +78,11 @@ class PinnedIteratorsManager : public Cleanable { private: static void ReleaseInternalIterator(void* ptr) { - delete reinterpret_cast(ptr); + delete static_cast(ptr); } static void ReleaseArenaInternalIterator(void* ptr) { - reinterpret_cast(ptr)->~InternalIterator(); + static_cast(ptr)->~InternalIterator(); } bool pinning_enabled; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index a6acb7b188c..11308624734 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -252,8 +252,6 @@ TEST_P(PlainTableDBTest, Empty) { ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); } -extern const uint64_t kPlainTableMagicNumber; - class TestPlainTableReader : public PlainTableReader { public: TestPlainTableReader( @@ -292,7 +290,7 @@ class TestPlainTableReader : public PlainTableReader { table_properties_ = std::move(props); } - ~TestPlainTableReader() override {} + ~TestPlainTableReader() override = default; private: bool MatchBloom(uint32_t hash) const override { @@ -307,7 +305,6 @@ class TestPlainTableReader : public PlainTableReader { bool* expect_bloom_not_match_; }; -extern const uint64_t kPlainTableMagicNumber; class TestPlainTableFactory : public PlainTableFactory { public: explicit TestPlainTableFactory(bool* expect_bloom_not_match, @@ -499,8 +496,8 @@ TEST_P(PlainTableDBTest, Flush) { ASSERT_GT(int_num, 0U); TablePropertiesCollection ptc; - ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables( - &ptc)); + ASSERT_OK( + static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); auto row = ptc.begin(); auto tp = row->second; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index bb6e6f7a670..b55956aa88e 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -89,8 +89,12 @@ class TestKeyComparator : public Comparator { const TestKey* key_a = &kkey_a; const TestKey* key_b = &kkey_b; if (key_a->prefix != key_b->prefix) { - if (key_a->prefix < key_b->prefix) return -1; - if (key_a->prefix > key_b->prefix) return 1; + if (key_a->prefix < key_b->prefix) { + return -1; + } + if (key_a->prefix > key_b->prefix) { + return 1; + } } else { EXPECT_TRUE(key_a->prefix == key_b->prefix); // note, both a and b could be prefix only @@ -99,8 +103,12 @@ class TestKeyComparator : public Comparator { EXPECT_TRUE( (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) || (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey))); - if (a.size() < b.size()) return -1; - if (a.size() > b.size()) return 1; + if (a.size() < b.size()) { + return -1; + } + if (a.size() > b.size()) { + return 1; + } } else { // both a and b are prefix if (a.size() == sizeof(uint64_t)) { @@ -109,9 +117,15 @@ class TestKeyComparator : public Comparator { // both a and b are whole key EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey)); - if (key_a->sorted < key_b->sorted) return -1; - if (key_a->sorted > key_b->sorted) return 1; - if (key_a->sorted == key_b->sorted) return 0; + if (key_a->sorted < key_b->sorted) { + return -1; + } + if (key_a->sorted > key_b->sorted) { + return 1; + } + if (key_a->sorted == key_b->sorted) { + return 0; + } } } return 0; @@ -892,4 +906,3 @@ int main(int argc, char** argv) { } #endif // GFLAGS - diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 6e76f9c7258..f41521e1162 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -8,13 +8,11 @@ #include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" #include "rocksdb/comparator.h" #include "rocksdb/types.h" #include "table/internal_iterator.h" -#include "table/scoped_arena_iterator.h" #include "table/table_builder.h" #include "util/heap.h" #include "util/kv_map.h" diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index f7fa87af40d..f367a26787e 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -22,7 +22,6 @@ #include "rocksdb/comparator.h" #include "rocksdb/types.h" #include "table/internal_iterator.h" -#include "table/scoped_arena_iterator.h" #include "table/table_builder.h" #include "util/heap.h" #include "util/kv_map.h" diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index 7e7cedeca48..565ee33908a 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -20,7 +20,8 @@ namespace ROCKSDB_NAMESPACE { FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( std::unique_ptr unfragmented_tombstones, const InternalKeyComparator& icmp, bool for_compaction, - const std::vector& snapshots) { + const std::vector& snapshots, + const bool tombstone_end_include_ts) { if (unfragmented_tombstones == nullptr) { return; } @@ -45,7 +46,12 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( last_start_key = pinned_last_start_key.Encode(); } } - if (is_sorted) { + + auto ucmp = icmp.user_comparator(); + assert(ucmp); + const size_t ts_sz = ucmp->timestamp_size(); + bool pad_min_ts_for_end = ts_sz > 0 && !tombstone_end_include_ts; + if (is_sorted && !pad_min_ts_for_end) { FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction, snapshots); return; @@ -63,8 +69,15 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( unfragmented_tombstones->value().size(); keys.emplace_back(unfragmented_tombstones->key().data(), unfragmented_tombstones->key().size()); - values.emplace_back(unfragmented_tombstones->value().data(), - unfragmented_tombstones->value().size()); + Slice value = unfragmented_tombstones->value(); + if (pad_min_ts_for_end) { + AppendKeyWithMinTimestamp(&values.emplace_back(), value, ts_sz); + } else { + values.emplace_back(value.data(), value.size()); + } + } + if (pad_min_ts_for_end) { + total_tombstone_payload_bytes_ += num_unfragmented_tombstones_ * ts_sz; } // VectorIterator implicitly sorts by key during construction. auto iter = std::make_unique(std::move(keys), diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index ce631d495e6..8bdbf03be97 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -54,7 +54,8 @@ struct FragmentedRangeTombstoneList { FragmentedRangeTombstoneList( std::unique_ptr unfragmented_tombstones, const InternalKeyComparator& icmp, bool for_compaction = false, - const std::vector& snapshots = {}); + const std::vector& snapshots = {}, + const bool tombstone_end_include_ts = true); std::vector::const_iterator begin() const { return tombstones_.begin(); @@ -198,13 +199,15 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { pinned_seq_pos_ = tombstones_->seq_end(); } - RangeTombstone Tombstone() const { + RangeTombstone Tombstone(bool logical_strip_timestamp = false) const { assert(Valid()); if (icmp_->user_comparator()->timestamp_size()) { - return RangeTombstone(start_key(), end_key(), seq(), timestamp()); + return RangeTombstone(start_key(), end_key(), seq(), timestamp(), + logical_strip_timestamp); } return RangeTombstone(start_key(), end_key(), seq()); } + // Note that start_key() and end_key() are not guaranteed to have the // correct timestamp. User can call timestamp() to get the correct // timestamp(). diff --git a/db/repair.cc b/db/repair.cc index ef21f7ea611..4fe8b478863 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -81,7 +81,6 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" -#include "table/scoped_arena_iterator.h" #include "table/unique_id_impl.h" #include "util/string_util.h" @@ -123,7 +122,7 @@ class Repairer { raw_table_cache_.get(), &wb_, &wc_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc, - /*error_handler=*/nullptr), + /*error_handler=*/nullptr, /*read_only=*/false), next_file_number_(1), db_lock_(nullptr), closed_(false) { @@ -146,8 +145,10 @@ class Repairer { // Adds a column family to the VersionSet with cf_options_ and updates // manifest. Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { - // TODO: plumb Env::IOActivity; + // TODO: plumb Env::IOActivity, Env::IOPriority; const ReadOptions read_options; + const WriteOptions write_options; + const auto* cf_opts = GetColumnFamilyOptions(cf_name); if (cf_opts == nullptr) { return Status::Corruption("Encountered unknown column family with name=" + @@ -170,9 +171,9 @@ class Repairer { Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (status.ok()) { - status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_, - db_dir.get(), false /* new_descriptor_log */, - cf_opts); + status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, write_options, + &edit, &mutex_, db_dir.get(), + false /* new_descriptor_log */, cf_opts); } mutex_.Unlock(); return status; @@ -362,9 +363,6 @@ class Repairer { } }; - // TODO: plumb Env::IOActivity - const ReadOptions read_options; - // Open the log file std::string logname = LogFileName(wal_dir, log); const auto& fs = env_->GetFileSystem(); @@ -440,11 +438,12 @@ class Repairer { FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.total_order_seek = true; Arena arena; - ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + ScopedArenaPtr iter( + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); int64_t _current_time = 0; immutable_db_options_.clock->GetCurrentTime(&_current_time) .PermitUncheckedError(); // ignore error @@ -463,26 +462,29 @@ class Repairer { IOStatus io_s; CompressionOptions default_compression; + // TODO: plumb Env::IOActivity, Env::IOPriority + const ReadOptions read_options; + const WriteOptions write_option(Env::IO_HIGH); TableBuilderOptions tboptions( - *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), - -1 /* level */, false /* is_bottommost */, - TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, - 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_, - 0 /*target_file_size*/, meta.fd.GetNumber()); + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), read_options, + write_option, cfd->internal_comparator(), + cfd->internal_tbl_prop_coll_factories(), kNoCompression, + default_compression, cfd->GetID(), cfd->GetName(), -1 /* level */, + false /* is_bottommost */, TableFileCreationReason::kRecovery, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, + meta.fd.GetNumber()); SeqnoToTimeMapping empty_seqno_to_time_mapping; status = BuildTable( dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, - file_options_, read_options, table_cache_.get(), iter.get(), + file_options_, table_cache_.get(), iter.get(), std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, - empty_seqno_to_time_mapping, nullptr /* event_logger */, - 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - write_hint); + nullptr /* seqno_to_time_mapping */, nullptr /* event_logger */, + 0 /* job_id */, nullptr /* table_properties */, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -529,7 +531,7 @@ class Repairer { file_size); std::shared_ptr props; if (status.ok()) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; status = table_cache_->GetTableProperties( file_options_, read_options, icmp_, t->meta, &props, @@ -592,7 +594,7 @@ class Repairer { } } if (status.ok()) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( @@ -641,7 +643,7 @@ class Repairer { // an SST file is a full sorted run. This probably needs the extra logic // from compaction_job.cc around call to UpdateBoundariesForRange (to // handle range tombstones extendingg beyond range of other entries). - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( @@ -666,8 +668,10 @@ class Repairer { } Status AddTables() { - // TODO: plumb Env::IOActivity; + // TODO: plumb Env::IOActivity, Env::IOPriority; const ReadOptions read_options; + const WriteOptions write_options; + std::unordered_map> cf_id_to_tables; SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { @@ -755,8 +759,8 @@ class Repairer { nullptr); if (s.ok()) { s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - read_options, &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */); + read_options, write_options, &edit, &mutex_, + db_dir.get(), false /* new_descriptor_log */); } mutex_.Unlock(); } diff --git a/db/repair_test.cc b/db/repair_test.cc index e8cc40aab4a..8adc06f0c5f 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -378,8 +378,7 @@ TEST_P(RepairTestWithTimestamp, UnflushedSst) { ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); ASSERT_OK(DB::Open(options, dbname_, column_families, &handles_, &db_)); diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc index 199c59c9bbb..f08fb3a29f3 100644 --- a/db/seqno_time_test.cc +++ b/db/seqno_time_test.cc @@ -33,10 +33,11 @@ class SeqnoTimeTest : public DBTestBase { void SetUp() override { mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( - "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { + "DBImpl::StartPeriodicTaskScheduler:Init", + [mock_clock = mock_clock_](void* arg) { auto periodic_task_scheduler_ptr = - reinterpret_cast(arg); - periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); + static_cast(arg); + periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock.get()); }); mock_clock_->SetCurrentTime(kMockStartTime); } @@ -78,7 +79,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { options.compaction_style = kCompactionStyleUniversal; options.preclude_last_level_data_seconds = 10000; options.env = mock_env_.get(); - options.bottommost_temperature = Temperature::kCold; + options.last_level_temperature = Temperature::kCold; options.num_levels = kNumLevels; DestroyAndReopen(options); @@ -180,7 +181,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { Options options = CurrentOptions(); options.preclude_last_level_data_seconds = 10000; options.env = mock_env_.get(); - options.bottommost_temperature = Temperature::kCold; + options.last_level_temperature = Temperature::kCold; options.num_levels = kNumLevels; options.level_compaction_dynamic_level_bytes = true; // TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if @@ -330,25 +331,41 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { ASSERT_EQ(tables_props.size(), 1); auto it = tables_props.begin(); SeqnoToTimeMapping tp_mapping; - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); // about ~20 seqs->time entries, because the sample rate is 10000/100, and it // passes 2k time. Add (roughly) one for starting entry. - ASSERT_GE(seqs.size(), 20); - ASSERT_LE(seqs.size(), 22); - SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber() + 1; - for (auto i = start_seq; i < seq_end; i++) { - // The result is within the range - ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) * 10 - 100); - ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) * 10); - } + // Revised: with automatic pre-population of mappings, some of these entries + // might be purged to keep the DB mapping within capacity. + EXPECT_GE(seqs.size(), 20 / 2); + EXPECT_LE(seqs.size(), 22); + + auto ValidateProximalSeqnos = [&](const char* name, double fuzz_ratio) { + SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber() + 1; + uint64_t end_time = mock_clock_->NowSeconds(); + uint64_t seqno_fuzz = + static_cast((seq_end - start_seq) * fuzz_ratio + 0.999999); + for (unsigned time_pct = 0; time_pct <= 100; time_pct++) { + SCOPED_TRACE("name=" + std::string(name) + + " time_pct=" + std::to_string(time_pct)); + // Validate the important proximal API (GetProximalSeqnoBeforeTime) + uint64_t t = start_time + time_pct * (end_time - start_time) / 100; + auto seqno_reported = tp_mapping.GetProximalSeqnoBeforeTime(t); + auto seqno_expected = start_seq + time_pct * (seq_end - start_seq) / 100; + EXPECT_LE(seqno_reported, seqno_expected); + if (end_time - t < 10000) { + EXPECT_LE(seqno_expected, seqno_reported + seqno_fuzz); + } + } + start_seq = seq_end; + start_time = end_time; + }; + + ValidateProximalSeqnos("a", 0.1); + checked_file_nums.insert(it->second->orig_file_number); - start_seq = seq_end; - start_time = mock_clock_->NowSeconds(); // Write a key every 1 seconds for (int i = 0; i < 200; i++) { @@ -356,7 +373,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(1)); }); } - seq_end = dbfull()->GetLatestSequenceNumber() + 1; + ASSERT_OK(Flush()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); @@ -371,21 +388,17 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { ASSERT_TRUE(it != tables_props.end()); tp_mapping.Clear(); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); seqs = tp_mapping.TEST_GetInternalMapping(); // There only a few time sample ASSERT_GE(seqs.size(), 1); ASSERT_LE(seqs.size(), 3); - for (auto i = start_seq; i < seq_end; i++) { - ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) - 100); - ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq)); - } + + // High fuzz ratio because of low number of samples + ValidateProximalSeqnos("b", 0.5); + checked_file_nums.insert(it->second->orig_file_number); - start_seq = seq_end; - start_time = mock_clock_->NowSeconds(); // Write a key every 200 seconds for (int i = 0; i < 200; i++) { @@ -393,7 +406,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(200)); }); } - seq_end = dbfull()->GetLatestSequenceNumber() + 1; + // seq_end = dbfull()->GetLatestSequenceNumber() + 1; ASSERT_OK(Flush()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); @@ -408,24 +421,16 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { ASSERT_TRUE(it != tables_props.end()); tp_mapping.Clear(); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); seqs = tp_mapping.TEST_GetInternalMapping(); - // The sequence number -> time entries should be maxed - ASSERT_GE(seqs.size(), 99); - ASSERT_LE(seqs.size(), 101); - for (auto i = start_seq; i < seq_end; i++) { - // aged out entries allowed to report time=0 - if ((seq_end - i) * 200 <= 10000) { - ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) * 200 - 100); - } - ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) * 200); - } + // For the preserved time span, only 10000/200=50 (+1) entries were recorded + ASSERT_GE(seqs.size(), 50); + ASSERT_LE(seqs.size(), 51); + + ValidateProximalSeqnos("c", 0.04); + checked_file_nums.insert(it->second->orig_file_number); - start_seq = seq_end; - start_time = mock_clock_->NowSeconds(); // Write a key every 100 seconds for (int i = 0; i < 200; i++) { @@ -433,7 +438,6 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } - seq_end = dbfull()->GetLatestSequenceNumber() + 1; ASSERT_OK(Flush()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); @@ -447,9 +451,11 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { } ASSERT_TRUE(it != tables_props.end()); tp_mapping.Clear(); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); seqs = tp_mapping.TEST_GetInternalMapping(); + // For the preserved time span, max entries were recorded and + // preserved (10000/100=100 (+1)) ASSERT_GE(seqs.size(), 99); ASSERT_LE(seqs.size(), 101); @@ -474,21 +480,14 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { } ASSERT_TRUE(it != tables_props.end()); tp_mapping.Clear(); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); seqs = tp_mapping.TEST_GetInternalMapping(); ASSERT_GE(seqs.size(), 99); ASSERT_LE(seqs.size(), 101); - for (auto i = start_seq; i < seq_end; i++) { - // aged out entries allowed to report time=0 - // FIXME: should be <= - if ((seq_end - i) * 100 < 10000) { - ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) * 100 - 100); - } - ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), - start_time + (i - start_seq) * 100); - } + + ValidateProximalSeqnos("d", 0.02); + ASSERT_OK(db_->Close()); } @@ -545,8 +544,8 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { ASSERT_EQ(tables_props.size(), 1); it = tables_props.begin(); SeqnoToTimeMapping tp_mapping; - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); ASSERT_GE(seqs.size(), 1); @@ -565,7 +564,8 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { } seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping(); ASSERT_GE(seqs.size(), 1000 - 1); - ASSERT_LE(seqs.size(), 1000 + 1); + // Non-strict limit can exceed capacity by a reasonable fraction + ASSERT_LE(seqs.size(), 1000 * 9 / 8); ASSERT_OK(Flush(2)); tables_props.clear(); @@ -573,8 +573,8 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { ASSERT_EQ(tables_props.size(), 1); it = tables_props.begin(); tp_mapping.Clear(); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); seqs = tp_mapping.TEST_GetInternalMapping(); // the max encoded entries is 100 ASSERT_GE(seqs.size(), 100 - 1); @@ -606,8 +606,8 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { ASSERT_EQ(tables_props.size(), 1); it = tables_props.begin(); tp_mapping.Clear(); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); seqs = tp_mapping.TEST_GetInternalMapping(); ASSERT_GE(seqs.size(), 99); ASSERT_LE(seqs.size(), 101); @@ -721,8 +721,8 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { for (const auto& props : tables_props) { ASSERT_FALSE(props.second->seqno_to_time_mapping.empty()); SeqnoToTimeMapping tp_mapping; - ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping)); - ASSERT_OK(tp_mapping.Sort()); + ASSERT_OK(tp_mapping.DecodeFrom(props.second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); // Add (roughly) one for starting entry. @@ -746,7 +746,8 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { auto it = tables_props.begin(); SeqnoToTimeMapping tp_mapping; ASSERT_FALSE(it->second->seqno_to_time_mapping.empty()); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); // compact to the last level CompactRangeOptions cro; @@ -773,7 +774,8 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { it = tables_props.begin(); ASSERT_FALSE(it->second->seqno_to_time_mapping.empty()); - ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping)); + ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping)); + ASSERT_TRUE(tp_mapping.TEST_IsEnforced()); // make half of the data expired mock_clock_->MockSleepForSeconds(static_cast(8000)); @@ -929,7 +931,7 @@ TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) { DestroyAndReopen(track_options); // Ensure pre-population - constexpr auto kPrePopPairs = SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST; + constexpr auto kPrePopPairs = kMaxSeqnoTimePairsPerSST; sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); latest_seqno = db_->GetLatestSequenceNumber(); start_time = mock_clock_->NowSeconds(); @@ -970,7 +972,7 @@ TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) { sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); latest_seqno = db_->GetLatestSequenceNumber(); end_time = mock_clock_->NowSeconds(); - ASSERT_EQ(sttm.Size(), kPrePopPairs); + ASSERT_GE(sttm.Size(), kPrePopPairs); ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno); ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs / 2), kPrePopPairs / 2); @@ -1015,42 +1017,153 @@ TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) { } TEST_F(SeqnoTimeTest, MappingAppend) { - SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); + using P = SeqnoToTimeMapping::SeqnoTimePair; + SeqnoToTimeMapping test; + test.SetMaxTimeSpan(100).SetCapacity(10); // ignore seqno == 0, as it may mean the seqno is zeroed out - ASSERT_FALSE(test.Append(0, 9)); + ASSERT_FALSE(test.Append(0, 100)); - ASSERT_TRUE(test.Append(3, 10)); + ASSERT_TRUE(test.Append(3, 200)); auto size = test.Size(); // normal add - ASSERT_TRUE(test.Append(10, 11)); + ASSERT_TRUE(test.Append(10, 300)); size++; ASSERT_EQ(size, test.Size()); - // Append unsorted - ASSERT_FALSE(test.Append(8, 12)); - ASSERT_EQ(size, test.Size()); - // Append with the same seqno, newer time is rejected because that makes // GetProximalSeqnoBeforeTime queries worse (see later test) - ASSERT_FALSE(test.Append(10, 12)); + ASSERT_FALSE(test.Append(10, 301)); ASSERT_EQ(size, test.Size()); - // older time will be ignored - ASSERT_FALSE(test.Append(10, 9)); + ASSERT_EQ(test.TEST_GetLastEntry(), P({10, 300})); + + // Same or new seqno with same or older time (as last successfully added) is + // accepted by replacing last entry (improves GetProximalSeqnoBeforeTime + // queries without blowing up size) + ASSERT_FALSE(test.Append(10, 299)); ASSERT_EQ(size, test.Size()); + ASSERT_EQ(test.TEST_GetLastEntry(), P({10, 299})); - // new seqno with old time will be ignored - ASSERT_FALSE(test.Append(12, 8)); + ASSERT_FALSE(test.Append(11, 299)); ASSERT_EQ(size, test.Size()); + ASSERT_EQ(test.TEST_GetLastEntry(), P({11, 299})); - // new seqno with same time is accepted by replacing last entry - // (improves GetProximalSeqnoBeforeTime queries without blowing up size) - ASSERT_TRUE(test.Append(12, 11)); + ASSERT_FALSE(test.Append(11, 250)); ASSERT_EQ(size, test.Size()); + ASSERT_EQ(test.TEST_GetLastEntry(), P({11, 250})); +} + +TEST_F(SeqnoTimeTest, CapacityLimits) { + using P = SeqnoToTimeMapping::SeqnoTimePair; + SeqnoToTimeMapping test; + + test.SetCapacity(3); + EXPECT_TRUE(test.Append(10, 300)); + EXPECT_TRUE(test.Append(20, 400)); + EXPECT_TRUE(test.Append(30, 500)); + EXPECT_TRUE(test.Append(40, 600)); + // Capacity 3 is small enough that the non-strict limit is + // equal to the strict limit. + EXPECT_EQ(3U, test.Size()); + EXPECT_EQ(test.TEST_GetLastEntry(), P({40, 600})); + + // Same for Capacity 2 + test.SetCapacity(2); + EXPECT_EQ(2U, test.Size()); + EXPECT_EQ(test.TEST_GetLastEntry(), P({40, 600})); + + EXPECT_TRUE(test.Append(50, 700)); + EXPECT_EQ(2U, test.Size()); + EXPECT_EQ(test.TEST_GetLastEntry(), P({50, 700})); + + // Capacity 1 is difficult to work with internally, so is + // coerced to 2. + test.SetCapacity(1); + EXPECT_EQ(2U, test.Size()); + EXPECT_EQ(test.TEST_GetLastEntry(), P({50, 700})); + + EXPECT_TRUE(test.Append(60, 800)); + EXPECT_EQ(2U, test.Size()); + EXPECT_EQ(test.TEST_GetLastEntry(), P({60, 800})); + + // Capacity 0 means throw everything away + test.SetCapacity(0); + EXPECT_EQ(0U, test.Size()); + + EXPECT_FALSE(test.Append(70, 900)); + EXPECT_EQ(0U, test.Size()); + + // Unlimited capacity + test.SetCapacity(UINT64_MAX); + for (unsigned i = 1; i <= 10101U; i++) { + EXPECT_TRUE(test.Append(i, 11U * i)); + } + EXPECT_EQ(10101U, test.Size()); +} + +TEST_F(SeqnoTimeTest, TimeSpanLimits) { + SeqnoToTimeMapping test; + + // Default: no limit + for (unsigned i = 1; i <= 63U; i++) { + EXPECT_TRUE(test.Append(1000 + i, uint64_t{1} << i)); + } + // None dropped. + EXPECT_EQ(63U, test.Size()); + + test.Clear(); + + // Explicit no limit + test.SetMaxTimeSpan(UINT64_MAX); + for (unsigned i = 1; i <= 63U; i++) { + EXPECT_TRUE(test.Append(1000 + i, uint64_t{1} << i)); + } + // None dropped. + EXPECT_EQ(63U, test.Size()); + + // We generally keep 2 entries as long as the configured max time span + // is non-zero + test.SetMaxTimeSpan(10); + EXPECT_EQ(2U, test.Size()); + + test.SetMaxTimeSpan(1); + EXPECT_EQ(2U, test.Size()); + + // But go down to 1 entry if the max time span is zero + test.SetMaxTimeSpan(0); + EXPECT_EQ(1U, test.Size()); + + EXPECT_TRUE(test.Append(2000, (uint64_t{1} << 63) + 42U)); + EXPECT_EQ(1U, test.Size()); + + test.Clear(); + + // Test more typical behavior. Note that one entry at or beyond the max span + // is kept. + test.SetMaxTimeSpan(100); + EXPECT_TRUE(test.Append(1001, 123)); + EXPECT_TRUE(test.Append(1002, 134)); + EXPECT_TRUE(test.Append(1003, 150)); + EXPECT_TRUE(test.Append(1004, 189)); + EXPECT_TRUE(test.Append(1005, 220)); + EXPECT_EQ(5U, test.Size()); + EXPECT_TRUE(test.Append(1006, 233)); + EXPECT_EQ(6U, test.Size()); + EXPECT_TRUE(test.Append(1007, 234)); + EXPECT_EQ(6U, test.Size()); + EXPECT_TRUE(test.Append(1008, 235)); + EXPECT_EQ(7U, test.Size()); + EXPECT_TRUE(test.Append(1009, 300)); + EXPECT_EQ(6U, test.Size()); + EXPECT_TRUE(test.Append(1010, 350)); + EXPECT_EQ(3U, test.Size()); + EXPECT_TRUE(test.Append(1011, 470)); + EXPECT_EQ(2U, test.Size()); } TEST_F(SeqnoTimeTest, ProximalFunctions) { - SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); + SeqnoToTimeMapping test; + test.SetCapacity(10); EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1), kUnknownTimeBeforeAll); EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), @@ -1081,6 +1194,7 @@ TEST_F(SeqnoTimeTest, ProximalFunctions) { // More samples EXPECT_TRUE(test.Append(20, 600)); EXPECT_TRUE(test.Append(30, 700)); + EXPECT_EQ(test.Size(), 3U); EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll); EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U); @@ -1140,8 +1254,9 @@ TEST_F(SeqnoTimeTest, ProximalFunctions) { // Burst of writes during a short time creates an opportunity // for better results from GetProximalSeqnoBeforeTime(), at the - // expense of GetProximalTimeBeforeSeqno(). - EXPECT_TRUE(test.Append(50, 900)); + // expense of GetProximalTimeBeforeSeqno(). False return indicates + // merge with previous entry. + EXPECT_FALSE(test.Append(50, 900)); // These are subject to later revision depending on priorities EXPECT_EQ(test.GetProximalTimeBeforeSeqno(49), 700U); @@ -1151,7 +1266,8 @@ TEST_F(SeqnoTimeTest, ProximalFunctions) { } TEST_F(SeqnoTimeTest, PrePopulate) { - SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); + SeqnoToTimeMapping test; + test.SetMaxTimeSpan(100).SetCapacity(10); EXPECT_EQ(test.Size(), 0U); @@ -1194,14 +1310,102 @@ TEST_F(SeqnoTimeTest, PrePopulate) { } } -TEST_F(SeqnoTimeTest, TruncateOldEntries) { - constexpr uint64_t kMaxTimeDuration = 42; - SeqnoToTimeMapping test(kMaxTimeDuration, /*max_capacity=*/10); +TEST_F(SeqnoTimeTest, CopyFromSeqnoRange) { + SeqnoToTimeMapping test_from; + SeqnoToTimeMapping test_to; + + // With zero to draw from + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 1000000); + EXPECT_EQ(test_to.Size(), 0U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 100, 100); + EXPECT_EQ(test_to.Size(), 0U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, kMaxSequenceNumber, 0); + EXPECT_EQ(test_to.Size(), 0U); + + // With one to draw from + EXPECT_TRUE(test_from.Append(10, 500)); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 1000000); + EXPECT_EQ(test_to.Size(), 1U); + + // Includes one entry before range + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 100, 100); + EXPECT_EQ(test_to.Size(), 1U); + + // Includes one entry before range (even if somewhat nonsensical) + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, kMaxSequenceNumber, 0); + EXPECT_EQ(test_to.Size(), 1U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 9); + EXPECT_EQ(test_to.Size(), 0U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 10); + EXPECT_EQ(test_to.Size(), 1U); + + // With more to draw from + EXPECT_TRUE(test_from.Append(20, 600)); + EXPECT_TRUE(test_from.Append(30, 700)); + EXPECT_TRUE(test_from.Append(40, 800)); + EXPECT_TRUE(test_from.Append(50, 900)); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 1000000); + EXPECT_EQ(test_to.Size(), 5U); + + // Includes one entry before range + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 100, 100); + EXPECT_EQ(test_to.Size(), 1U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 19, 19); + EXPECT_EQ(test_to.Size(), 1U); + + // Includes one entry before range (even if somewhat nonsensical) + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, kMaxSequenceNumber, 0); + EXPECT_EQ(test_to.Size(), 1U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 9); + EXPECT_EQ(test_to.Size(), 0U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 0, 10); + EXPECT_EQ(test_to.Size(), 1U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 20, 20); + EXPECT_EQ(test_to.Size(), 2U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 20, 29); + EXPECT_EQ(test_to.Size(), 2U); + + test_to.Clear(); + test_to.CopyFromSeqnoRange(test_from, 20, 30); + EXPECT_EQ(test_to.Size(), 3U); +} + +TEST_F(SeqnoTimeTest, EnforceWithNow) { + constexpr uint64_t kMaxTimeSpan = 420; + SeqnoToTimeMapping test; + test.SetMaxTimeSpan(kMaxTimeSpan).SetCapacity(10); EXPECT_EQ(test.Size(), 0U); // Safe on empty mapping - test.TruncateOldEntries(500); + test.Enforce(/*now=*/500); EXPECT_EQ(test.Size(), 0U); @@ -1223,13 +1427,13 @@ TEST_F(SeqnoTimeTest, TruncateOldEntries) { // etc. // Must keep first entry - test.TruncateOldEntries(500 + kMaxTimeDuration); + test.Enforce(/*now=*/500 + kMaxTimeSpan); EXPECT_EQ(test.Size(), 5U); - test.TruncateOldEntries(599 + kMaxTimeDuration); + test.Enforce(/*now=*/599 + kMaxTimeSpan); EXPECT_EQ(test.Size(), 5U); // Purges first entry - test.TruncateOldEntries(600 + kMaxTimeDuration); + test.Enforce(/*now=*/600 + kMaxTimeSpan); EXPECT_EQ(test.Size(), 4U); EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), kUnknownSeqnoBeforeAll); @@ -1239,20 +1443,20 @@ TEST_F(SeqnoTimeTest, TruncateOldEntries) { EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U); // No effect - test.TruncateOldEntries(600 + kMaxTimeDuration); + test.Enforce(/*now=*/600 + kMaxTimeSpan); EXPECT_EQ(test.Size(), 4U); - test.TruncateOldEntries(699 + kMaxTimeDuration); + test.Enforce(/*now=*/699 + kMaxTimeSpan); EXPECT_EQ(test.Size(), 4U); // Purges next two - test.TruncateOldEntries(899 + kMaxTimeDuration); + test.Enforce(/*now=*/899 + kMaxTimeSpan); EXPECT_EQ(test.Size(), 2U); EXPECT_EQ(test.GetProximalSeqnoBeforeTime(799), kUnknownSeqnoBeforeAll); EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 40U); // Always keep last entry, to have a non-trivial seqno bound - test.TruncateOldEntries(10000000); + test.Enforce(/*now=*/10000000); EXPECT_EQ(test.Size(), 1U); EXPECT_EQ(test.GetProximalSeqnoBeforeTime(10000000), 50U); @@ -1262,67 +1466,114 @@ TEST_F(SeqnoTimeTest, Sort) { SeqnoToTimeMapping test; // single entry - test.Add(10, 11); - ASSERT_OK(test.Sort()); + test.AddUnenforced(10, 11); + test.Enforce(); ASSERT_EQ(test.Size(), 1); - // duplicate, should be removed by sort - test.Add(10, 11); - // same seqno, but older time, should be removed - test.Add(10, 9); + // duplicate is ignored + test.AddUnenforced(10, 11); + test.Enforce(); + ASSERT_EQ(test.Size(), 1); - // unuseful ones, should be removed by sort - test.Add(11, 9); - test.Add(9, 8); + // add some revised mappings for that seqno + test.AddUnenforced(10, 10); + test.AddUnenforced(10, 12); - // Good ones - test.Add(1, 10); - test.Add(100, 100); + // We currently favor GetProximalSeqnoBeforeTime over + // GetProximalTimeBeforeSeqno by keeping the older time. + test.Enforce(); + auto seqs = test.TEST_GetInternalMapping(); + std::deque expected; + expected.emplace_back(10, 10); + ASSERT_EQ(expected, seqs); - ASSERT_OK(test.Sort()); + // add an inconsistent / unuseful mapping + test.AddUnenforced(9, 11); + test.Enforce(); + seqs = test.TEST_GetInternalMapping(); + ASSERT_EQ(expected, seqs); - auto seqs = test.TEST_GetInternalMapping(); + // And a mapping that is considered more useful (for + // GetProximalSeqnoBeforeTime) and thus replaces that one + test.AddUnenforced(11, 9); + test.Enforce(); + seqs = test.TEST_GetInternalMapping(); + expected.clear(); + expected.emplace_back(11, 9); + ASSERT_EQ(expected, seqs); - std::deque expected; - expected.emplace_back(1, 10); - expected.emplace_back(10, 11); + // Add more good, non-mergable entries + test.AddUnenforced(1, 5); + test.AddUnenforced(100, 100); + test.Enforce(); + seqs = test.TEST_GetInternalMapping(); + expected.clear(); + expected.emplace_back(1, 5); + expected.emplace_back(11, 9); expected.emplace_back(100, 100); - ASSERT_EQ(expected, seqs); } TEST_F(SeqnoTimeTest, EncodeDecodeBasic) { - SeqnoToTimeMapping test(0, 1000); + constexpr uint32_t kOriginalSamples = 1000; + SeqnoToTimeMapping test; + test.SetCapacity(kOriginalSamples); std::string output; - test.Encode(output, 0, 1000, 100); + test.EncodeTo(output); ASSERT_TRUE(output.empty()); - for (int i = 1; i <= 1000; i++) { - ASSERT_TRUE(test.Append(i, i * 10)); - } - test.Encode(output, 0, 1000, 100); + ASSERT_OK(test.DecodeFrom(output)); + ASSERT_EQ(test.Size(), 0U); + Random rnd(123); + for (uint32_t i = 1; i <= kOriginalSamples; i++) { + ASSERT_TRUE(test.Append(i, i * 10 + rnd.Uniform(10))); + } + output.clear(); + test.EncodeTo(output); ASSERT_FALSE(output.empty()); SeqnoToTimeMapping decoded; - ASSERT_OK(decoded.Add(output)); - ASSERT_OK(decoded.Sort()); - ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST); - ASSERT_EQ(test.Size(), 1000); - - for (SequenceNumber seq = 0; seq <= 1000; seq++) { - // test has the more accurate time mapping, encode only pick - // kMaxSeqnoTimePairsPerSST number of entries, which is less accurate - uint64_t target_time = test.GetProximalTimeBeforeSeqno(seq); - ASSERT_GE(decoded.GetProximalTimeBeforeSeqno(seq), - target_time < 200 ? 0 : target_time - 200); - ASSERT_LE(decoded.GetProximalTimeBeforeSeqno(seq), target_time); + ASSERT_OK(decoded.DecodeFrom(output)); + ASSERT_TRUE(decoded.TEST_IsEnforced()); + ASSERT_EQ(test.Size(), decoded.Size()); + ASSERT_EQ(test.TEST_GetInternalMapping(), decoded.TEST_GetInternalMapping()); + + // Encode a reduced set of mappings + constexpr uint32_t kReducedSize = 51U; + output.clear(); + SeqnoToTimeMapping(test).SetCapacity(kReducedSize).EncodeTo(output); + + decoded.Clear(); + ASSERT_OK(decoded.DecodeFrom(output)); + ASSERT_TRUE(decoded.TEST_IsEnforced()); + ASSERT_EQ(decoded.Size(), kReducedSize); + + for (uint64_t t = 1; t <= kOriginalSamples * 11; t += 1 + t / 100) { + SCOPED_TRACE("t=" + std::to_string(t)); + // `test` has the more accurate time mapping, but the reduced set should + // nicely span and approximate the whole range + auto orig_s = test.GetProximalSeqnoBeforeTime(t); + auto approx_s = decoded.GetProximalSeqnoBeforeTime(t); + // The oldest entry should be preserved exactly + ASSERT_EQ(orig_s == kUnknownSeqnoBeforeAll, + approx_s == kUnknownSeqnoBeforeAll); + // The newest entry should be preserved exactly + ASSERT_EQ(orig_s == kOriginalSamples, approx_s == kOriginalSamples); + + // Approximate seqno before time should err toward older seqno to avoid + // classifying data as old too early, but should be within a reasonable + // bound. + constexpr uint32_t kSeqnoFuzz = kOriginalSamples * 3 / 2 / kReducedSize; + EXPECT_GE(approx_s + kSeqnoFuzz, orig_s); + EXPECT_GE(orig_s, approx_s); } } -TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) { - SeqnoToTimeMapping test(0, 10); +TEST_F(SeqnoTimeTest, EncodeDecodeMinimizeTimeGaps) { + SeqnoToTimeMapping test; + test.SetCapacity(10); test.Append(1, 10); test.Append(5, 17); @@ -1330,45 +1581,66 @@ TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) { test.Append(8, 30); std::string output; - test.Encode(output, 1, 10, 0, 3); + SeqnoToTimeMapping(test).SetCapacity(3).EncodeTo(output); SeqnoToTimeMapping decoded; - ASSERT_OK(decoded.Add(output)); - ASSERT_OK(decoded.Sort()); + ASSERT_OK(decoded.DecodeFrom(output)); + ASSERT_TRUE(decoded.TEST_IsEnforced()); ASSERT_EQ(decoded.Size(), 3); auto seqs = decoded.TEST_GetInternalMapping(); std::deque expected; expected.emplace_back(1, 10); - expected.emplace_back(6, 25); + expected.emplace_back(5, 17); expected.emplace_back(8, 30); ASSERT_EQ(expected, seqs); // Add a few large time number test.Append(10, 100); test.Append(13, 200); - test.Append(16, 300); + test.Append(40, 250); + test.Append(70, 300); output.clear(); - test.Encode(output, 1, 20, 0, 4); + SeqnoToTimeMapping(test).SetCapacity(4).EncodeTo(output); decoded.Clear(); - ASSERT_OK(decoded.Add(output)); - ASSERT_OK(decoded.Sort()); + ASSERT_OK(decoded.DecodeFrom(output)); + ASSERT_TRUE(decoded.TEST_IsEnforced()); ASSERT_EQ(decoded.Size(), 4); expected.clear(); + // Except for beginning and end, entries are removed that minimize the + // remaining time gaps, regardless of seqno gaps. expected.emplace_back(1, 10); - // entry #6, #8 are skipped as they are too close to #1. - // entry #100 is also within skip range, but if it's skipped, there not enough - // number to fill 4 entries, so select it. expected.emplace_back(10, 100); expected.emplace_back(13, 200); - expected.emplace_back(16, 300); + expected.emplace_back(70, 300); seqs = decoded.TEST_GetInternalMapping(); ASSERT_EQ(expected, seqs); } +TEST(PackValueAndSeqnoTest, Basic) { + std::string packed_value_buf; + Slice packed_value_slice = + PackValueAndWriteTime("foo", 30u, &packed_value_buf); + auto [unpacked_value, write_time] = + ParsePackedValueWithWriteTime(packed_value_slice); + ASSERT_EQ(unpacked_value, "foo"); + ASSERT_EQ(write_time, 30u); + ASSERT_EQ(ParsePackedValueForValue(packed_value_slice), "foo"); +} + +TEST(PackValueAndWriteTimeTest, Basic) { + std::string packed_value_buf; + Slice packed_value_slice = PackValueAndSeqno("foo", 30u, &packed_value_buf); + auto [unpacked_value, write_time] = + ParsePackedValueWithSeqno(packed_value_slice); + ASSERT_EQ(unpacked_value, "foo"); + ASSERT_EQ(write_time, 30u); + ASSERT_EQ(ParsePackedValueForValue(packed_value_slice), "foo"); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/seqno_to_time_mapping.cc b/db/seqno_to_time_mapping.cc index 97a3e987986..63555701968 100644 --- a/db/seqno_to_time_mapping.cc +++ b/db/seqno_to_time_mapping.cc @@ -6,6 +6,14 @@ #include "db/seqno_to_time_mapping.h" +#include +#include +#include +#include +#include +#include +#include + #include "db/version_edit.h" #include "util/string_util.h" @@ -13,25 +21,28 @@ namespace ROCKSDB_NAMESPACE { SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterTime( uint64_t time) const { + assert(enforced_); return std::upper_bound(pairs_.cbegin(), pairs_.cend(), SeqnoTimePair{0, time}, SeqnoTimePair::TimeLess); } SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterEqSeqno( SequenceNumber seqno) const { + assert(enforced_); return std::lower_bound(pairs_.cbegin(), pairs_.cend(), SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess); } SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterSeqno( SequenceNumber seqno) const { + assert(enforced_); return std::upper_bound(pairs_.cbegin(), pairs_.cend(), SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess); } uint64_t SeqnoToTimeMapping::GetProximalTimeBeforeSeqno( SequenceNumber seqno) const { - assert(is_sorted_); + assert(enforced_); // Find the last entry with a seqno strictly less than the given seqno. // First, find the first entry >= the given seqno (or end) auto it = FindGreaterEqSeqno(seqno); @@ -43,43 +54,9 @@ uint64_t SeqnoToTimeMapping::GetProximalTimeBeforeSeqno( return it->time; } -void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) { - if (seqno == 0) { - return; - } - is_sorted_ = false; - pairs_.emplace_back(seqno, time); -} - -void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) { - assert(is_sorted_); - - if (max_time_duration_ == 0) { - // No cutoff time - return; - } - - if (now < max_time_duration_) { - // Would under-flow - return; - } - - const uint64_t cut_off_time = now - max_time_duration_; - assert(cut_off_time <= now); // no under/overflow - - auto it = FindGreaterTime(cut_off_time); - if (it == pairs_.cbegin()) { - return; - } - // Move back one, to the entry that would be used to return a good seqno from - // GetProximalSeqnoBeforeTime(cut_off_time) - --it; - // Remove everything strictly before that entry - pairs_.erase(pairs_.cbegin(), std::move(it)); -} - -SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime(uint64_t time) { - assert(is_sorted_); +SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime( + uint64_t time) const { + assert(enforced_); // Find the last entry with a time <= the given time. // First, find the first entry > the given time (or end). @@ -92,130 +69,312 @@ SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime(uint64_t time) { return it->seqno; } -// The encoded format is: -// [num_of_entries][[seqno][time],[seqno][time],...] -// ^ ^ -// var_int delta_encoded (var_int) -void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start, - const SequenceNumber end, const uint64_t now, - const uint64_t output_size) const { - assert(is_sorted_); - if (start > end) { - // It could happen when the SST file is empty, the initial value of min - // sequence number is kMaxSequenceNumber and max is 0. - // The empty output file will be removed in the final step of compaction. +void SeqnoToTimeMapping::EnforceMaxTimeSpan(uint64_t now) { + assert(enforced_); // at least sorted + uint64_t cutoff_time; + if (pairs_.size() <= 1) { return; } - - auto start_it = FindGreaterSeqno(start); - if (start_it != pairs_.begin()) { - start_it--; + if (now > 0) { + if (now < max_time_span_) { + // Nothing eligible to prune / avoid underflow + return; + } + cutoff_time = now - max_time_span_; + } else { + const auto& last = pairs_.back(); + if (last.time < max_time_span_) { + // Nothing eligible to prune / avoid underflow + return; + } + cutoff_time = last.time - max_time_span_; + } + // Keep one entry <= cutoff_time + while (pairs_.size() >= 2 && pairs_[0].time <= cutoff_time && + pairs_[1].time <= cutoff_time) { + pairs_.pop_front(); } +} - auto end_it = FindGreaterSeqno(end); - if (end_it == pairs_.begin()) { +void SeqnoToTimeMapping::EnforceCapacity(bool strict) { + assert(enforced_); // at least sorted + uint64_t strict_cap = capacity_; + if (strict_cap == 0) { + pairs_.clear(); return; } - if (start_it >= end_it) { + // Treat cap of 1 as 2 to work with the below algorithm (etc.) + if (strict_cap == 1) { + strict_cap = 2; + } + // When !strict, allow being over nominal capacity by a modest fraction. + uint64_t effective_cap = strict_cap + (strict ? 0 : strict_cap / 8); + if (effective_cap < strict_cap) { + // Correct overflow + effective_cap = UINT64_MAX; + } + if (pairs_.size() <= effective_cap) { return; } + // The below algorithm expects at least one removal candidate between first + // and last. + assert(pairs_.size() >= 3); + size_t to_remove_count = pairs_.size() - strict_cap; + + struct RemovalCandidate { + uint64_t new_time_gap; + std::deque::iterator it; + RemovalCandidate(uint64_t _new_time_gap, + std::deque::iterator _it) + : new_time_gap(_new_time_gap), it(_it) {} + bool operator>(const RemovalCandidate& other) const { + if (new_time_gap == other.new_time_gap) { + // If same gap, treat the newer entry as less attractive + // for removal (like larger gap) + return it->seqno > other.it->seqno; + } + return new_time_gap > other.new_time_gap; + } + }; + + // A priority queue of best removal candidates (smallest time gap remaining + // after removal) + using RC = RemovalCandidate; + using PQ = std::priority_queue, std::greater>; + PQ pq; + + // Add all the candidates (not including first and last) + { + auto it = pairs_.begin(); + assert(it->time != kUnknownTimeBeforeAll); + uint64_t prev_prev_time = it->time; + ++it; + assert(it->time != kUnknownTimeBeforeAll); + auto prev_it = it; + ++it; + while (it != pairs_.end()) { + assert(it->time != kUnknownTimeBeforeAll); + uint64_t gap = it->time - prev_prev_time; + pq.emplace(gap, prev_it); + prev_prev_time = prev_it->time; + prev_it = it; + ++it; + } + } - // truncate old entries that are not needed - if (max_time_duration_ > 0) { - const uint64_t cut_off_time = - now > max_time_duration_ ? now - max_time_duration_ : 0; - while (start_it < end_it && start_it->time < cut_off_time) { - start_it++; + // Greedily remove the best candidate, iteratively + while (to_remove_count > 0) { + assert(!pq.empty()); + // Remove the candidate with smallest gap + auto rc = pq.top(); + pq.pop(); + + // NOTE: priority_queue does not support updating an existing element, + // but we can work around that because the gap tracked in pq is only + // going to be better than actuality, and we can detect and adjust + // when a better-than-actual gap is found. + + // Determine actual time gap if this entry is removed (zero entries are + // marked for deletion) + auto it = rc.it + 1; + uint64_t after_time = it->time; + while (after_time == kUnknownTimeBeforeAll) { + assert(it != pairs_.end()); + ++it; + after_time = it->time; + } + it = rc.it - 1; + uint64_t before_time = it->time; + while (before_time == kUnknownTimeBeforeAll) { + assert(it != pairs_.begin()); + --it; + before_time = it->time; + } + // Check whether the gap is still valid (or needs to be recomputed) + if (rc.new_time_gap == after_time - before_time) { + // Mark the entry as removed + rc.it->time = kUnknownTimeBeforeAll; + --to_remove_count; + } else { + // Insert a replacement up-to-date removal candidate + pq.emplace(after_time - before_time, rc.it); } } - // to include the first element - if (start_it != pairs_.begin()) { - start_it--; - } - - // If there are more data than needed, pick the entries for encoding. - // It's not the most optimized algorithm for selecting the best representative - // entries over the time. - // It starts from the beginning and makes sure the distance is larger than - // `(end - start) / size` before selecting the number. For example, for the - // following list, pick 3 entries (it will pick seqno #1, #6, #8): - // 1 -> 10 - // 5 -> 17 - // 6 -> 25 - // 8 -> 30 - // first, it always picks the first one, then there are 2 num_entries_to_fill - // and the time difference between current one vs. the last one is - // (30 - 10) = 20. 20/2 = 10. So it will skip until 10+10 = 20. => it skips - // #5 and pick #6. - // But the most optimized solution is picking #1 #5 #8, as it will be more - // evenly distributed for time. Anyway the following algorithm is simple and - // may over-select new data, which is good. We do want more accurate time - // information for recent data. - std::deque output_copy; - if (std::distance(start_it, end_it) > static_cast(output_size)) { - int64_t num_entries_to_fill = static_cast(output_size); - auto last_it = end_it; - last_it--; - uint64_t end_time = last_it->time; - uint64_t skip_until_time = 0; - for (auto it = start_it; it < end_it; it++) { - // skip if it's not reach the skip_until_time yet - if (std::distance(it, end_it) > num_entries_to_fill && - it->time < skip_until_time) { - continue; + + // Collapse away entries marked for deletion + auto from_it = pairs_.begin(); + auto to_it = from_it; + + for (; from_it != pairs_.end(); ++from_it) { + if (from_it->time != kUnknownTimeBeforeAll) { + if (from_it != to_it) { + *to_it = *from_it; } - output_copy.push_back(*it); - num_entries_to_fill--; - if (std::distance(it, end_it) > num_entries_to_fill && - num_entries_to_fill > 0) { - // If there are more entries than we need, re-calculate the - // skip_until_time, which means skip until that time - skip_until_time = - it->time + ((end_time - it->time) / num_entries_to_fill); + ++to_it; + } + } + + // Erase slots freed up + pairs_.erase(to_it, pairs_.end()); + assert(pairs_.size() == strict_cap); +} + +bool SeqnoToTimeMapping::SeqnoTimePair::Merge(const SeqnoTimePair& other) { + assert(seqno <= other.seqno); + if (seqno == other.seqno) { + // Favoring GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno + // by keeping the older time. For example, consider nothing has been + // written to the DB in some time. + time = std::min(time, other.time); + return true; + } else if (time == other.time) { + // Favoring GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno + // by keeping the newer seqno. For example, when a burst of writes ages + // out, we want the cutoff to be the newest seqno from that burst. + seqno = std::max(seqno, other.seqno); + return true; + } else if (time > other.time) { + assert(seqno < other.seqno); + // Need to resolve an inconsistency (clock drift? very rough time?). + // Given the direction that entries are supposed to err, trust the earlier + // time entry as more reliable, and this choice ensures we don't + // accidentally throw out an entry within our time span. + *this = other; + return true; + } else { + // Not merged + return false; + } +} + +void SeqnoToTimeMapping::SortAndMerge() { + assert(!enforced_); + if (!pairs_.empty()) { + std::sort(pairs_.begin(), pairs_.end()); + + auto from_it = pairs_.begin(); + auto to_it = from_it; + for (++from_it; from_it != pairs_.end(); ++from_it) { + if (to_it->Merge(*from_it)) { + // Merged with last entry + } else { + // Copy into next entry + *++to_it = *from_it; } } + // Erase slots freed up from merging + pairs_.erase(to_it + 1, pairs_.end()); + } + // Mark as "at least sorted" + enforced_ = true; +} + +SeqnoToTimeMapping& SeqnoToTimeMapping::SetMaxTimeSpan(uint64_t max_time_span) { + max_time_span_ = max_time_span; + if (enforced_) { + EnforceMaxTimeSpan(); + } + return *this; +} + +SeqnoToTimeMapping& SeqnoToTimeMapping::SetCapacity(uint64_t capacity) { + capacity_ = capacity; + if (enforced_) { + EnforceCapacity(/*strict=*/true); + } + return *this; +} + +SeqnoToTimeMapping& SeqnoToTimeMapping::Enforce(uint64_t now) { + if (!enforced_) { + SortAndMerge(); + assert(enforced_); + EnforceMaxTimeSpan(now); + } else if (now > 0) { + EnforceMaxTimeSpan(now); + } + EnforceCapacity(/*strict=*/true); + return *this; +} - // Make sure all entries are filled - assert(num_entries_to_fill == 0); - start_it = output_copy.begin(); - end_it = output_copy.end(); +void SeqnoToTimeMapping::AddUnenforced(SequenceNumber seqno, uint64_t time) { + if (seqno == 0) { + return; } + enforced_ = false; + pairs_.emplace_back(seqno, time); +} - // Delta encode the data - uint64_t size = std::distance(start_it, end_it); - PutVarint64(&dest, size); +// The encoded format is: +// [num_of_entries][[seqno][time],[seqno][time],...] +// ^ ^ +// var_int delta_encoded (var_int) +// Except empty string is used for empty mapping. This means the encoding +// doesn't fully form a prefix code, but that is OK for applications like +// TableProperties. +void SeqnoToTimeMapping::EncodeTo(std::string& dest) const { + assert(enforced_); + // Can use empty string for empty mapping + if (pairs_.empty()) { + return; + } + // Encode number of entries + PutVarint64(&dest, pairs_.size()); SeqnoTimePair base; - for (auto it = start_it; it < end_it; it++) { - assert(base < *it); - SeqnoTimePair val = it->ComputeDelta(base); - base = *it; + for (auto& cur : pairs_) { + assert(base < cur); + // Delta encode each entry + SeqnoTimePair val = cur.ComputeDelta(base); + base = cur; val.Encode(dest); } } -Status SeqnoToTimeMapping::Add(const std::string& pairs_str) { - Slice input(pairs_str); +namespace { +Status DecodeImpl(Slice& input, + std::deque& pairs) { if (input.empty()) { return Status::OK(); } - uint64_t size; - if (!GetVarint64(&input, &size)) { + uint64_t count; + if (!GetVarint64(&input, &count)) { return Status::Corruption("Invalid sequence number time size"); } - is_sorted_ = false; - SeqnoTimePair base; - for (uint64_t i = 0; i < size; i++) { - SeqnoTimePair val; + + SeqnoToTimeMapping::SeqnoTimePair base; + for (uint64_t i = 0; i < count; i++) { + SeqnoToTimeMapping::SeqnoTimePair val; Status s = val.Decode(input); if (!s.ok()) { return s; } val.ApplyDelta(base); - pairs_.emplace_back(val); + pairs.emplace_back(val); base = val; } + + if (!input.empty()) { + return Status::Corruption( + "Extra bytes at end of sequence number time mapping"); + } return Status::OK(); } +} // namespace + +Status SeqnoToTimeMapping::DecodeFrom(const std::string& pairs_str) { + size_t orig_size = pairs_.size(); + + Slice input(pairs_str); + Status s = DecodeImpl(input, pairs_); + if (!s.ok()) { + // Roll back in case of corrupted data + pairs_.resize(orig_size); + } else if (orig_size > 0 || max_time_span_ < UINT64_MAX || + capacity_ < UINT64_MAX) { + enforced_ = false; + } + return s; +} void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const { PutVarint64Varint64(&dest, seqno, time); @@ -231,38 +390,74 @@ Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) { return Status::OK(); } -bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) { - assert(is_sorted_); +void SeqnoToTimeMapping::CopyFromSeqnoRange(const SeqnoToTimeMapping& src, + SequenceNumber from_seqno, + SequenceNumber to_seqno) { + bool orig_empty = Empty(); + auto src_it = src.FindGreaterEqSeqno(from_seqno); + // Allow nonsensical ranges like [1000, 0] which might show up e.g. for + // an SST file with no entries. + auto src_it_end = + to_seqno < from_seqno ? src_it : src.FindGreaterSeqno(to_seqno); + // To best answer GetProximalTimeBeforeSeqno(from_seqno) we need an entry + // with a seqno before that (if available) + if (src_it != src.pairs_.begin()) { + --src_it; + } + assert(src_it <= src_it_end); + std::copy(src_it, src_it_end, std::back_inserter(pairs_)); - // skip seq number 0, which may have special meaning, like zeroed out data - if (seqno == 0) { + if (!orig_empty || max_time_span_ < UINT64_MAX || capacity_ < UINT64_MAX) { + enforced_ = false; + } +} + +bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) { + if (capacity_ == 0) { return false; } - if (!Empty()) { - if (seqno < Last().seqno || time < Last().time) { - return false; - } - if (seqno == Last().seqno) { - // Updating Last() would hurt GetProximalSeqnoBeforeTime() queries, so - // NOT doing it (for now) - return false; - } - if (time == Last().time) { - // Updating Last() here helps GetProximalSeqnoBeforeTime() queries, so - // doing it (for now) - Last().seqno = seqno; - return true; + bool added = false; + if (seqno == 0) { + // skip seq number 0, which may have special meaning, like zeroed out data + // TODO: consider changing? + } else if (pairs_.empty()) { + enforced_ = true; + pairs_.emplace_back(seqno, time); + // skip normal enforced check below + return true; + } else { + auto& last = pairs_.back(); + // We can attempt to merge with the last entry if the new entry sorts with + // it. + if (last.seqno <= seqno) { + bool merged = last.Merge({seqno, time}); + if (!merged) { + if (enforced_ && (seqno <= last.seqno || time <= last.time)) { + // Out of order append should not happen, except in case of clock + // reset + assert(false); + } else { + pairs_.emplace_back(seqno, time); + added = true; + } + } + } else if (!enforced_) { + // Treat like AddUnenforced and fix up below + pairs_.emplace_back(seqno, time); + added = true; + } else { + // Out of order append attempted + assert(false); } } - - pairs_.emplace_back(seqno, time); - - if (pairs_.size() > max_capacity_) { - // FIXME: be smarter about how we erase to avoid data falling off the - // front prematurely. - pairs_.pop_front(); + // Similar to Enforce() but not quite + if (!enforced_) { + SortAndMerge(); + assert(enforced_); } - return true; + EnforceMaxTimeSpan(); + EnforceCapacity(/*strict=*/false); + return added; } bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno, @@ -284,64 +479,6 @@ bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno, return /*success*/ true; } -bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration, - uint64_t max_time_duration) { - uint64_t new_max_capacity = - CalculateMaxCapacity(min_time_duration, max_time_duration); - if (new_max_capacity == max_capacity_) { - return false; - } else if (new_max_capacity < pairs_.size()) { - uint64_t delta = pairs_.size() - new_max_capacity; - // FIXME: be smarter about how we erase to avoid data falling off the - // front prematurely. - pairs_.erase(pairs_.begin(), pairs_.begin() + delta); - } - max_capacity_ = new_max_capacity; - return true; -} - -Status SeqnoToTimeMapping::Sort() { - if (is_sorted_) { - return Status::OK(); - } - if (pairs_.empty()) { - is_sorted_ = true; - return Status::OK(); - } - - std::deque copy = std::move(pairs_); - - std::sort(copy.begin(), copy.end()); - - pairs_.clear(); - - // remove seqno = 0, which may have special meaning, like zeroed out data - while (copy.front().seqno == 0) { - copy.pop_front(); - } - - SeqnoTimePair prev = copy.front(); - for (const auto& it : copy) { - // If sequence number is the same, pick the one with larger time, which is - // more accurate than the older time. - if (it.seqno == prev.seqno) { - assert(it.time >= prev.time); - prev.time = it.time; - } else { - assert(it.seqno > prev.seqno); - // If a larger sequence number has an older time which is not useful, skip - if (it.time > prev.time) { - pairs_.push_back(prev); - prev = it; - } - } - } - pairs_.emplace_back(prev); - - is_sorted_ = true; - return Status::OK(); -} - std::string SeqnoToTimeMapping::ToHumanString() const { std::string ret; for (const auto& seq_time : pairs_) { @@ -353,25 +490,54 @@ std::string SeqnoToTimeMapping::ToHumanString() const { return ret; } -SeqnoToTimeMapping SeqnoToTimeMapping::Copy( - SequenceNumber smallest_seqno) const { - SeqnoToTimeMapping ret; - auto it = FindGreaterSeqno(smallest_seqno); - if (it != pairs_.begin()) { - it--; - } - std::copy(it, pairs_.end(), std::back_inserter(ret.pairs_)); - return ret; +Slice PackValueAndWriteTime(const Slice& value, uint64_t unix_write_time, + std::string* buf) { + buf->assign(value.data(), value.size()); + PutFixed64(buf, unix_write_time); + return Slice(*buf); } -uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration, - uint64_t max_time_duration) { - if (min_time_duration == 0) { - return 0; - } - return std::min( - kMaxSeqnoToTimeEntries, - max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration); +Slice PackValueAndSeqno(const Slice& value, SequenceNumber seqno, + std::string* buf) { + buf->assign(value.data(), value.size()); + PutFixed64(buf, seqno); + return Slice(*buf); +} + +uint64_t ParsePackedValueForWriteTime(const Slice& value) { + assert(value.size() >= sizeof(uint64_t)); + Slice write_time_slice(value.data() + value.size() - sizeof(uint64_t), + sizeof(uint64_t)); + uint64_t write_time; + [[maybe_unused]] auto res = GetFixed64(&write_time_slice, &write_time); + assert(res); + return write_time; +} + +std::tuple ParsePackedValueWithWriteTime(const Slice& value) { + return std::make_tuple(Slice(value.data(), value.size() - sizeof(uint64_t)), + ParsePackedValueForWriteTime(value)); +} + +SequenceNumber ParsePackedValueForSeqno(const Slice& value) { + assert(value.size() >= sizeof(SequenceNumber)); + Slice seqno_slice(value.data() + value.size() - sizeof(uint64_t), + sizeof(uint64_t)); + SequenceNumber seqno; + [[maybe_unused]] auto res = GetFixed64(&seqno_slice, &seqno); + assert(res); + return seqno; +} + +std::tuple ParsePackedValueWithSeqno( + const Slice& value) { + return std::make_tuple( + Slice(value.data(), value.size() - sizeof(SequenceNumber)), + ParsePackedValueForSeqno(value)); } +Slice ParsePackedValueForValue(const Slice& value) { + assert(value.size() >= sizeof(uint64_t)); + return Slice(value.data(), value.size() - sizeof(uint64_t)); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/seqno_to_time_mapping.h b/db/seqno_to_time_mapping.h index 95a4455be18..a9255a806fe 100644 --- a/db/seqno_to_time_mapping.h +++ b/db/seqno_to_time_mapping.h @@ -8,11 +8,13 @@ #include #include +#include #include #include #include #include +#include "db/dbformat.h" #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -21,6 +23,22 @@ namespace ROCKSDB_NAMESPACE { constexpr uint64_t kUnknownTimeBeforeAll = 0; constexpr SequenceNumber kUnknownSeqnoBeforeAll = 0; +// Maximum number of entries can be encoded into SST. The data is delta encode +// so the maximum data usage for each SST is < 0.3K +constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100; + +// Maximum number of entries per CF. If there's only CF with this feature on, +// the max span divided by this number, so for example, if +// preclude_last_level_data_seconds = 100000 (~1day), then it will sample the +// seqno -> time every 1000 seconds (~17minutes). Then the maximum entry it +// needs is 100. +// When there are multiple CFs having this feature on, the sampling cadence is +// determined by the smallest setting, the capacity is determined the largest +// setting, also it's caped by kMaxSeqnoTimePairsPerCF * 10. +constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100; + +constexpr uint64_t kMaxSeqnoToTimeEntries = kMaxSeqnoTimePairsPerCF * 10; + // SeqnoToTimeMapping stores a sampled mapping from sequence numbers to // unix times (seconds since epoch). This information provides rough bounds // between sequence numbers and their write times, but is primarily designed @@ -39,27 +57,16 @@ constexpr SequenceNumber kUnknownSeqnoBeforeAll = 0; // 20 -> 600 // 30 -> 700 // -// In typical operation, the list is sorted, both among seqnos and among times, -// with a bounded number of entries, but some public working states violate -// these constraints. +// In typical operation, the list is in "enforced" operation to maintain +// invariants on sortedness, capacity, and time span of entries. However, some +// operations will put the object into "unenforced" mode where those invariants +// are relaxed until explicitly or implicitly re-enforced (which will sort and +// filter the data). // -// NOT thread safe - requires external synchronization. +// NOT thread safe - requires external synchronization, except a const +// object allows concurrent reads. class SeqnoToTimeMapping { public: - // Maximum number of entries can be encoded into SST. The data is delta encode - // so the maximum data usage for each SST is < 0.3K - static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100; - - // Maximum number of entries per CF. If there's only CF with this feature on, - // the max duration divided by this number, so for example, if - // preclude_last_level_data_seconds = 100000 (~1day), then it will sample the - // seqno -> time every 1000 seconds (~17minutes). Then the maximum entry it - // needs is 100. - // When there are multiple CFs having this feature on, the sampling cadence is - // determined by the smallest setting, the capacity is determined the largest - // setting, also it's caped by kMaxSeqnoTimePairsPerCF * 10. - static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100; - // A simple struct for sequence number to time pair struct SeqnoTimePair { SequenceNumber seqno = 0; @@ -86,6 +93,12 @@ class SeqnoToTimeMapping { time += delta_or_base.time; } + // If another pair can be combined into this one (for optimizing + // normal SeqnoToTimeMapping behavior), then this mapping is modified + // and true is returned, indicating the other mapping can be discarded. + // Otherwise false is returned and nothing is changed. + bool Merge(const SeqnoTimePair& other); + // Ordering used for Sort() bool operator<(const SeqnoTimePair& other) const { return std::tie(seqno, time) < std::tie(other.seqno, other.time); @@ -104,27 +117,78 @@ class SeqnoToTimeMapping { } }; - // constractor of SeqnoToTimeMapping - // max_time_duration is the maximum time it should track. For example, if - // preclude_last_level_data_seconds is 1 day, then if an entry is older than 1 - // day, then it can be removed. - // max_capacity is the maximum number of entry it can hold. For single CF, - // it's caped at 100 (kMaxSeqnoTimePairsPerCF), otherwise - // kMaxSeqnoTimePairsPerCF * 10. - // If it's set to 0, means it won't truncate any old data. - explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0, - uint64_t max_capacity = 0) - : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {} - - // Both seqno range and time range are inclusive. ... TODO - // + // Construct an empty SeqnoToTimeMapping with no limits. + SeqnoToTimeMapping() {} + + // ==== Configuration for enforced state ==== // + // Set a time span beyond which old entries can be deleted. Specifically, + // under enforcement mode, the structure will maintian only one entry older + // than the newest entry time minus max_time_span, so that + // GetProximalSeqnoBeforeTime queries back to that time return a good result. + // UINT64_MAX == unlimited. 0 == retain just one latest entry. Returns *this. + SeqnoToTimeMapping& SetMaxTimeSpan(uint64_t max_time_span); + + // Set the nominal capacity under enforcement mode. The structure is allowed + // to grow some reasonable fraction larger but will automatically compact + // down to this size. UINT64_MAX == unlimited. 0 == retain nothing. + // Returns *this. + SeqnoToTimeMapping& SetCapacity(uint64_t capacity); + + // ==== Modifiers, enforced ==== // + // Adds a series of mappings interpolating from from_seqno->from_time to + // to_seqno->to_time. This can only be called on an empty object and both + // seqno range and time range are inclusive. bool PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno, uint64_t from_time, uint64_t to_time); - // Append a new entry to the list. The new entry should be newer than the - // existing ones. It maintains the internal sorted status. + // Append a new entry to the list. The `seqno` should be >= all previous + // entries. This operation maintains enforced mode invariants, and will + // automatically (re-)enter enforced mode if not already in that state. + // Returns false if the entry was merged into the most recent entry + // rather than creating a new entry. bool Append(SequenceNumber seqno, uint64_t time); + // Clear all entries and (re-)enter enforced mode if not already in that + // state. Enforced limits are unchanged. + void Clear() { + pairs_.clear(); + enforced_ = true; + } + + // Enters the "enforced" state if not already in that state, which is + // useful before copying or querying. This will + // * Sort the entries + // * Discard any obsolete entries, which is aided if the caller specifies + // the `now` time so that entries older than now minus the max time span can + // be discarded. + // * Compact the entries to the configured capacity. + // Returns *this. + SeqnoToTimeMapping& Enforce(uint64_t now = 0); + + // ==== Modifiers, unenforced ==== // + // Add a new random entry and enter "unenforced" state. Unlike Append(), it + // can be any historical data. + void AddUnenforced(SequenceNumber seqno, uint64_t time); + + // Decode and add the entries to this mapping object. Unless starting from + // an empty mapping with no configured enforcement limits, this operation + // enters the unenforced state. + Status DecodeFrom(const std::string& pairs_str); + + // Copies entries from the src mapping object to this one, limited to entries + // needed to answer GetProximalTimeBeforeSeqno() queries for the given + // *inclusive* seqno range. The source structure must be in enforced + // state as a precondition. Unless starting with this object as empty mapping + // with no configured enforcement limits, this object enters the unenforced + // state. + void CopyFromSeqnoRange(const SeqnoToTimeMapping& src, + SequenceNumber from_seqno, + SequenceNumber to_seqno = kMaxSequenceNumber); + void CopyFrom(const SeqnoToTimeMapping& src) { + CopyFromSeqnoRange(src, kUnknownSeqnoBeforeAll, kMaxSequenceNumber); + } + + // ==== Accessors ==== // // Given a sequence number, return the best (largest / newest) known time // that is no later than the write time of that given sequence number. // If no such specific time is known, returns kUnknownTimeBeforeAll. @@ -133,12 +197,10 @@ class SeqnoToTimeMapping { // GetProximalTimeBeforeSeqno(11) -> 500 // GetProximalTimeBeforeSeqno(20) -> 500 // GetProximalTimeBeforeSeqno(21) -> 600 + // Because this is a const operation depending on sortedness, the structure + // must be in enforced state as a precondition. uint64_t GetProximalTimeBeforeSeqno(SequenceNumber seqno) const; - // Remove any entries not needed for GetProximalSeqnoBeforeTime queries of - // times older than `now - max_time_duration_` - void TruncateOldEntries(uint64_t now); - // Given a time, return the best (largest) sequence number whose write time // is no later than that given time. If no such specific sequence number is // known, returns kUnknownSeqnoBeforeAll. Using the example in the class @@ -147,74 +209,54 @@ class SeqnoToTimeMapping { // GetProximalSeqnoBeforeTime(500) -> 10 // GetProximalSeqnoBeforeTime(599) -> 10 // GetProximalSeqnoBeforeTime(600) -> 20 - SequenceNumber GetProximalSeqnoBeforeTime(uint64_t time); - - // Encode to a binary string. start and end seqno are both inclusive. - void Encode(std::string& des, SequenceNumber start, SequenceNumber end, - uint64_t now, - uint64_t output_size = kMaxSeqnoTimePairsPerSST) const; - - // Add a new random entry, unlike Append(), it can be any data, but also makes - // the list un-sorted. - void Add(SequenceNumber seqno, uint64_t time); + // Because this is a const operation depending on sortedness, the structure + // must be in enforced state as a precondition. + SequenceNumber GetProximalSeqnoBeforeTime(uint64_t time) const; - // Decode and add the entries to the current obj. The list will be unsorted - Status Add(const std::string& pairs_str); + // Encode to a binary string by appending to `dest`. + // Because this is a const operation depending on sortedness, the structure + // must be in enforced state as a precondition. + void EncodeTo(std::string& dest) const; // Return the number of entries size_t Size() const { return pairs_.size(); } - // Reduce the size of internal list - bool Resize(uint64_t min_time_duration, uint64_t max_time_duration); - - // Override the max_time_duration_ - void SetMaxTimeDuration(uint64_t max_time_duration) { - max_time_duration_ = max_time_duration; - } - - uint64_t GetCapacity() const { return max_capacity_; } - - // Sort the list, which also remove the redundant entries, useless entries, - // which makes sure the seqno is sorted, but also the time - Status Sort(); - - // copy the current obj from the given smallest_seqno. - SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const; + uint64_t GetCapacity() const { return capacity_; } // If the internal list is empty bool Empty() const { return pairs_.empty(); } - // clear all entries - void Clear() { pairs_.clear(); } - // return the string for user message // Note: Not efficient, okay for print std::string ToHumanString() const; #ifndef NDEBUG + const SeqnoTimePair& TEST_GetLastEntry() const { return pairs_.back(); } const std::deque& TEST_GetInternalMapping() const { return pairs_; } + bool TEST_IsEnforced() const { return enforced_; } #endif private: - static constexpr uint64_t kMaxSeqnoToTimeEntries = - kMaxSeqnoTimePairsPerCF * 10; - - uint64_t max_time_duration_; - uint64_t max_capacity_; + uint64_t max_time_span_ = UINT64_MAX; + uint64_t capacity_ = UINT64_MAX; std::deque pairs_; - bool is_sorted_ = true; + // Whether this object is in the "enforced" state. Between calls to public + // functions, enforced_==true means that + // * `pairs_` is sorted + // * The capacity limit (non-strict) is met + // * The time span limit is met + // However, some places within the implementation (Append()) will temporarily + // violate those last two conditions while enforced_==true. See also the + // Enforce*() and Sort*() private functions below. + bool enforced_ = true; - static uint64_t CalculateMaxCapacity(uint64_t min_time_duration, - uint64_t max_time_duration); - - SeqnoTimePair& Last() { - assert(!Empty()); - return pairs_.back(); - } + void EnforceMaxTimeSpan(uint64_t now = 0); + void EnforceCapacity(bool strict); + void SortAndMerge(); using pair_const_iterator = std::deque::const_iterator; @@ -223,4 +265,34 @@ class SeqnoToTimeMapping { pair_const_iterator FindGreaterEqSeqno(SequenceNumber seqno) const; }; +// === Utility methods used for TimedPut === // + +// Pack a value Slice and a unix write time into buffer `buf` and return a Slice +// for the packed value backed by `buf`. +Slice PackValueAndWriteTime(const Slice& value, uint64_t unix_write_time, + std::string* buf); + +// Pack a value Slice and a sequence number into buffer `buf` and return a Slice +// for the packed value backed by `buf`. +Slice PackValueAndSeqno(const Slice& value, SequenceNumber seqno, + std::string* buf); + +// Parse a packed value to get the write time. +uint64_t ParsePackedValueForWriteTime(const Slice& value); + +// Parse a packed value to get the value and the write time. The unpacked value +// Slice is backed up by the same memory backing up `value`. +std::tuple ParsePackedValueWithWriteTime(const Slice& value); + +// Parse a packed value to get the sequence number. +SequenceNumber ParsePackedValueForSeqno(const Slice& value); + +// Parse a packed value to get the value and the sequence number. The unpacked +// value Slice is backed up by the same memory backing up `value`. +std::tuple ParsePackedValueWithSeqno(const Slice& value); + +// Parse a packed value to get the value. The unpacked value Slice is backed up +// by the same memory backing up `value`. +Slice ParsePackedValueForValue(const Slice& value); + } // namespace ROCKSDB_NAMESPACE diff --git a/db/snapshot_checker.h b/db/snapshot_checker.h index b7ff1df8c01..4a6a71162ec 100644 --- a/db/snapshot_checker.h +++ b/db/snapshot_checker.h @@ -27,7 +27,7 @@ class SnapshotChecker { class DisableGCSnapshotChecker : public SnapshotChecker { public: virtual ~DisableGCSnapshotChecker() {} - virtual SnapshotCheckerResult CheckInSnapshot( + SnapshotCheckerResult CheckInSnapshot( SequenceNumber /*sequence*/, SequenceNumber /*snapshot_sequence*/) const override { // By returning kNotInSnapshot, we prevent all the values from being GCed @@ -48,7 +48,7 @@ class WritePreparedSnapshotChecker : public SnapshotChecker { explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db); virtual ~WritePreparedSnapshotChecker() {} - virtual SnapshotCheckerResult CheckInSnapshot( + SnapshotCheckerResult CheckInSnapshot( SequenceNumber sequence, SequenceNumber snapshot_sequence) const override; private: diff --git a/db/table_cache.cc b/db/table_cache.cc index b4f0d770563..02956c7c29c 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -85,7 +85,7 @@ TableCache::TableCache(const ImmutableOptions& ioptions, } } -TableCache::~TableCache() {} +TableCache::~TableCache() = default; Status TableCache::GetTableReader( const ReadOptions& ro, const FileOptions& file_options, @@ -395,7 +395,7 @@ uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, size_t prefix_size, GetContext* get_context, - SequenceNumber seq_no) { + Status* read_status, SequenceNumber seq_no) { bool found = false; row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); @@ -414,8 +414,8 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner); // If row cache hit, knowing cache key is the same to row_cache_key, // can use row_cache_key's seq no to construct InternalKey. - replayGetContextLog(*row_cache.Value(row_handle), user_key, get_context, - &value_pinner, seq_no); + *read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key, + get_context, &value_pinner, seq_no); RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; } else { @@ -440,21 +440,20 @@ Status TableCache::Get( // Check row cache if enabled. // Reuse row_cache_key sequence number when row cache hits. + Status s; if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { auto user_key = ExtractUserKey(k); uint64_t cache_entry_seq_no = CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), - get_context, cache_entry_seq_no); + get_context, &s, cache_entry_seq_no); if (!done) { row_cache_entry = &row_cache_entry_buffer; } } - Status s; TableReader* t = fd.table_reader; TypedHandle* handle = nullptr; - if (!done) { - assert(s.ok()); + if (s.ok() && !done) { if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, &handle, block_protection_bytes_per_key, prefix_extractor, @@ -489,9 +488,8 @@ Status TableCache::Get( s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { - // Couldn't find Table in cache but treat as kFound if no_io set + // Couldn't find table in cache and couldn't open it because of no_io. get_context->MarkKeyMayExist(); - s = Status::OK(); done = true; } } diff --git a/db/table_cache.h b/db/table_cache.h index 5b056f9a9f8..cf3cd25c914 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -273,6 +273,7 @@ class TableCache { // user key to row_cache_key at offset prefix_size bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, size_t prefix_size, GetContext* get_context, + Status* read_status, SequenceNumber seq_no = kMaxSequenceNumber); const ImmutableOptions& ioptions_; diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index 8ff03ec5015..f069c8b8055 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -50,8 +50,14 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) GetContext* get_context = miter->get_context; - if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, - get_context)) { + Status read_status; + bool ret = + GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, + get_context, &read_status); + if (!read_status.ok()) { + CO_RETURN read_status; + } + if (ret) { table_range.SkipKey(miter); } else { row_cache_entries.emplace_back(); @@ -103,7 +109,6 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) ++miter) { std::string& row_cache_entry = row_cache_entries[row_idx++]; const Slice& user_key = miter->ukey_with_ts; - ; GetContext* get_context = miter->get_context; get_context->SetReplayLog(nullptr); diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 968115c3d7a..53aff51cba0 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -17,9 +17,9 @@ namespace ROCKSDB_NAMESPACE { // Base class for internal table properties collector. -class IntTblPropCollector { +class InternalTblPropColl { public: - virtual ~IntTblPropCollector() {} + virtual ~InternalTblPropColl() {} virtual Status Finish(UserCollectedProperties* properties) = 0; virtual const char* Name() const = 0; @@ -39,26 +39,26 @@ class IntTblPropCollector { }; // Factory for internal table properties collector. -class IntTblPropCollectorFactory { +class InternalTblPropCollFactory { public: - virtual ~IntTblPropCollectorFactory() {} + virtual ~InternalTblPropCollFactory() {} // has to be thread-safe - virtual IntTblPropCollector* CreateIntTblPropCollector( + virtual InternalTblPropColl* CreateInternalTblPropColl( uint32_t column_family_id, int level_at_creation) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; }; -using IntTblPropCollectorFactories = - std::vector>; +using InternalTblPropCollFactories = + std::vector>; // When rocksdb creates a new table, it will encode all "user keys" into // "internal keys", which contains meta information of a given entry. // // This class extracts user key from the encoded internal key when Add() is // invoked. -class UserKeyTablePropertiesCollector : public IntTblPropCollector { +class UserKeyTablePropertiesCollector : public InternalTblPropColl { public: // transfer of ownership explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector) @@ -66,45 +66,46 @@ class UserKeyTablePropertiesCollector : public IntTblPropCollector { virtual ~UserKeyTablePropertiesCollector() {} - virtual Status InternalAdd(const Slice& key, const Slice& value, - uint64_t file_size) override; + Status InternalAdd(const Slice& key, const Slice& value, + uint64_t file_size) override; - virtual void BlockAdd(uint64_t block_uncomp_bytes, - uint64_t block_compressed_bytes_fast, - uint64_t block_compressed_bytes_slow) override; + void BlockAdd(uint64_t block_uncomp_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override; - virtual Status Finish(UserCollectedProperties* properties) override; + Status Finish(UserCollectedProperties* properties) override; - virtual const char* Name() const override { return collector_->Name(); } + const char* Name() const override { return collector_->Name(); } UserCollectedProperties GetReadableProperties() const override; - virtual bool NeedCompact() const override { - return collector_->NeedCompact(); - } + bool NeedCompact() const override { return collector_->NeedCompact(); } protected: std::unique_ptr collector_; }; class UserKeyTablePropertiesCollectorFactory - : public IntTblPropCollectorFactory { + : public InternalTblPropCollFactory { public: explicit UserKeyTablePropertiesCollectorFactory( std::shared_ptr user_collector_factory) : user_collector_factory_(user_collector_factory) {} - virtual IntTblPropCollector* CreateIntTblPropCollector( + InternalTblPropColl* CreateInternalTblPropColl( uint32_t column_family_id, int level_at_creation) override { TablePropertiesCollectorFactory::Context context; context.column_family_id = column_family_id; context.level_at_creation = level_at_creation; - return new UserKeyTablePropertiesCollector( - user_collector_factory_->CreateTablePropertiesCollector(context)); + TablePropertiesCollector* collector = + user_collector_factory_->CreateTablePropertiesCollector(context); + if (collector) { + return new UserKeyTablePropertiesCollector(collector); + } else { + return nullptr; + } } - virtual const char* Name() const override { - return user_collector_factory_->Name(); - } + const char* Name() const override { return user_collector_factory_->Name(); } private: std::shared_ptr user_collector_factory_; @@ -115,7 +116,7 @@ class UserKeyTablePropertiesCollectorFactory // internal key when Add() is invoked. // // @param cmp the user comparator to compare the timestamps in internal key. -class TimestampTablePropertiesCollector : public IntTblPropCollector { +class TimestampTablePropertiesCollector : public InternalTblPropColl { public: explicit TimestampTablePropertiesCollector(const Comparator* cmp) : cmp_(cmp), diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 437b7e30903..a10ebdc24dd 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -46,16 +46,19 @@ void MakeBuilder( const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const InternalKeyComparator& internal_comparator, - const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + const InternalTblPropCollFactories* internal_tbl_prop_coll_factories, std::unique_ptr* writable, std::unique_ptr* builder) { std::unique_ptr wf(new test::StringSink); writable->reset( new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + const ReadOptions read_options; + const WriteOptions write_options; TableBuilderOptions tboptions( - ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, - options.compression, options.compression_opts, kTestColumnFamilyId, - kTestColumnFamilyName, kTestLevel); + ioptions, moptions, read_options, write_options, internal_comparator, + internal_tbl_prop_coll_factories, options.compression, + options.compression_opts, kTestColumnFamilyId, kTestColumnFamilyName, + kTestLevel); builder->reset(NewTableBuilder(tboptions, writable->get())); } } // namespace @@ -155,7 +158,7 @@ class RegularKeysStartWithABackwardCompatible uint32_t count_ = 0; }; -class RegularKeysStartWithAInternal : public IntTblPropCollector { +class RegularKeysStartWithAInternal : public InternalTblPropColl { public: const char* Name() const override { return "RegularKeysStartWithA"; } @@ -180,7 +183,6 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { uint64_t /* block_compressed_bytes_fast */, uint64_t /* block_compressed_bytes_slow */) override { // Nothing to do. - return; } UserCollectedProperties GetReadableProperties() const override { @@ -191,7 +193,7 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { uint32_t count_ = 0; }; -class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, +class RegularKeysStartWithAFactory : public InternalTblPropCollFactory, public TablePropertiesCollectorFactory { public: explicit RegularKeysStartWithAFactory(bool backward_mode) @@ -206,7 +208,7 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, return new RegularKeysStartWithABackwardCompatible(); } } - IntTblPropCollector* CreateIntTblPropCollector( + InternalTblPropColl* CreateInternalTblPropColl( uint32_t /*column_family_id*/, int /* level_at_creation */) override { return new RegularKeysStartWithAInternal(); } @@ -227,7 +229,7 @@ class FlushBlockEveryThreePolicy : public FlushBlockPolicy { class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory { public: - explicit FlushBlockEveryThreePolicyFactory() {} + explicit FlushBlockEveryThreePolicyFactory() = default; const char* Name() const override { return "FlushBlockEveryThreePolicyFactory"; @@ -240,11 +242,9 @@ class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory { } }; -extern const uint64_t kBlockBasedTableMagicNumber; -extern const uint64_t kPlainTableMagicNumber; namespace { void TestCustomizedTablePropertiesCollector( - bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector, + bool backward_mode, uint64_t magic_number, bool test_internal_tbl_prop_coll, const Options& options, const InternalKeyComparator& internal_comparator) { // make sure the entries will be inserted with order. std::map, std::string> kvs = { @@ -265,15 +265,15 @@ void TestCustomizedTablePropertiesCollector( std::unique_ptr writer; const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; - if (test_int_tbl_prop_collector) { - int_tbl_prop_collector_factories.emplace_back( + InternalTblPropCollFactories internal_tbl_prop_coll_factories; + if (test_internal_tbl_prop_coll) { + internal_tbl_prop_coll_factories.emplace_back( new RegularKeysStartWithAFactory(backward_mode)); } else { - GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); + GetInternalTblPropCollFactory(ioptions, &internal_tbl_prop_coll_factories); } MakeBuilder(options, ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, &writer, &builder); + &internal_tbl_prop_coll_factories, &writer, &builder); SequenceNumber seqNum = 0U; for (const auto& kv : kvs) { @@ -281,7 +281,7 @@ void TestCustomizedTablePropertiesCollector( builder->Add(ikey.Encode(), kv.second); } ASSERT_OK(builder->Finish()); - ASSERT_OK(writer->Flush()); + ASSERT_OK(writer->Flush(IOOptions())); // -- Step 2: Read properties test::StringSink* fwf = @@ -308,7 +308,7 @@ void TestCustomizedTablePropertiesCollector( ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); ASSERT_EQ(3u, starts_with_A); - if (!backward_mode && !test_int_tbl_prop_collector) { + if (!backward_mode && !test_internal_tbl_prop_coll) { uint32_t num_puts; ASSERT_NE(user_collected.find("NumPuts"), user_collected.end()); Slice key_puts(user_collected.at("NumPuts")); @@ -392,7 +392,7 @@ void TestInternalKeyPropertiesCollector( Options options; test::PlainInternalKeyComparator pikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; options.table_factory = table_factory; if (sanitized) { options.table_properties_collector_factories.emplace_back( @@ -406,7 +406,7 @@ void TestInternalKeyPropertiesCollector( options = SanitizeOptions("db", // just a place holder options); ImmutableOptions ioptions(options); - GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); + GetInternalTblPropCollFactory(ioptions, &internal_tbl_prop_coll_factories); options.comparator = comparator; } const ImmutableOptions ioptions(options); @@ -414,13 +414,13 @@ void TestInternalKeyPropertiesCollector( for (int iter = 0; iter < 2; ++iter) { MakeBuilder(options, ioptions, moptions, pikc, - &int_tbl_prop_collector_factories, &writable, &builder); + &internal_tbl_prop_coll_factories, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } ASSERT_OK(builder->Finish()); - ASSERT_OK(writable->Flush()); + ASSERT_OK(writable->Flush(IOOptions())); test::StringSink* fwf = static_cast(writable->writable_file()); diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6568de23f6c..eb700036110 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -63,13 +63,13 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { std::unique_ptr files, VersionSet const* const versions, const bool seq_per_batch, const std::shared_ptr& io_tracer); - virtual bool Valid() override; + bool Valid() override; - virtual void Next() override; + void Next() override; - virtual Status status() override; + Status status() override; - virtual BatchResult GetBatch() override; + BatchResult GetBatch() override; private: const std::string& dir_; @@ -98,7 +98,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; - virtual void Corruption(size_t bytes, const Status& s) override { + void Corruption(size_t bytes, const Status& s) override { ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes, s.ToString().c_str()); } diff --git a/db/version_builder.cc b/db/version_builder.cc index 210b0de8694..9a72307d379 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1073,6 +1073,12 @@ class VersionBuilder::Rep { const uint64_t oldest_blob_file_with_linked_ssts = GetMinOldestBlobFileNumber(); + // If there are no blob files with linked SSTs, meaning that there are no + // valid blob files + if (oldest_blob_file_with_linked_ssts == kInvalidBlobFileNumber) { + return; + } + auto process_base = [vstorage](const std::shared_ptr& base_meta) { assert(base_meta); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 2ca10c449ce..3c7d8a61d73 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -1194,24 +1194,47 @@ TEST_F(VersionBuilderTest, SaveBlobFilesTo) { ASSERT_OK(second_builder.Apply(&second_edit)); - VersionStorageInfo newer_vstorage( + VersionStorageInfo new_vstorage_2( &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &new_vstorage, force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); - ASSERT_OK(second_builder.SaveTo(&newer_vstorage)); + ASSERT_OK(second_builder.SaveTo(&new_vstorage_2)); - UpdateVersionStorageInfo(&newer_vstorage); + UpdateVersionStorageInfo(&new_vstorage_2); - const auto& newer_blob_files = newer_vstorage.GetBlobFiles(); + const auto& newer_blob_files = new_vstorage_2.GetBlobFiles(); ASSERT_EQ(newer_blob_files.size(), 2); const auto newer_meta3 = - newer_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3); + new_vstorage_2.GetBlobFileMetaData(/* blob_file_number */ 3); ASSERT_EQ(newer_meta3, nullptr); - UnrefFilesInVersion(&newer_vstorage); + // Blob file #5 is referenced by table file #4, and blob file #9 is + // unreferenced. After deleting table file #4, all blob files will become + // unreferenced and will therefore be obsolete. + VersionBuilder third_builder(env_options, &ioptions_, table_cache, + &new_vstorage_2, version_set); + VersionEdit third_edit; + third_edit.DeleteFile(/* level */ 0, /* file_number */ 4); + + ASSERT_OK(third_builder.Apply(&third_edit)); + + VersionStorageInfo new_vstorage_3( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, + &new_vstorage_2, force_consistency_checks, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); + + ASSERT_OK(third_builder.SaveTo(&new_vstorage_3)); + + UpdateVersionStorageInfo(&new_vstorage_3); + + ASSERT_TRUE(new_vstorage_3.GetBlobFiles().empty()); + + UnrefFilesInVersion(&new_vstorage_3); + UnrefFilesInVersion(&new_vstorage_2); UnrefFilesInVersion(&new_vstorage); } diff --git a/db/version_edit.cc b/db/version_edit.cc index 0381855593a..1b10aec522e 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -503,8 +503,7 @@ void VersionEdit::EncodeFileBoundaries(std::string* dst, StripTimestampFromInternalKey(&largest_buf, meta.largest.Encode(), ts_sz); PutLengthPrefixedSlice(dst, smallest_buf); PutLengthPrefixedSlice(dst, largest_buf); - return; -}; +} Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); diff --git a/db/version_edit.h b/db/version_edit.h index eed8509ed67..63464b750b0 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -121,7 +121,7 @@ constexpr uint64_t kUnknownEpochNumber = 0; // will be dedicated to files ingested behind. constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1; -extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); +uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); // A copyable structure contains information needed to read data from an SST // file. It can contain a pointer to a table reader opened for the file, or diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 3e9701d40fc..f1b9d3472f1 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -34,37 +34,36 @@ void VersionEditHandlerBase::Iterate(log::Reader& reader, reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { VersionEdit edit; s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - s = read_buffer_.AddEdit(&edit); - if (!s.ok()) { - break; + if (s.ok()) { + s = read_buffer_.AddEdit(&edit); } - ColumnFamilyData* cfd = nullptr; - if (edit.IsInAtomicGroup()) { - if (read_buffer_.IsFull()) { - for (auto& e : read_buffer_.replay_buffer()) { - s = ApplyVersionEdit(e, &cfd); - if (!s.ok()) { - break; + if (s.ok()) { + ColumnFamilyData* cfd = nullptr; + if (edit.IsInAtomicGroup()) { + if (read_buffer_.IsFull()) { + s = OnAtomicGroupReplayBegin(); + for (size_t i = 0; s.ok() && i < read_buffer_.replay_buffer().size(); + i++) { + auto& e = read_buffer_.replay_buffer()[i]; + s = ApplyVersionEdit(e, &cfd); + if (s.ok()) { + recovered_edits++; + } + } + if (s.ok()) { + read_buffer_.Clear(); + s = OnAtomicGroupReplayEnd(); } - ++recovered_edits; } - if (!s.ok()) { - break; + } else { + s = ApplyVersionEdit(edit, &cfd); + if (s.ok()) { + recovered_edits++; } - read_buffer_.Clear(); - } - } else { - s = ApplyVersionEdit(edit, &cfd); - if (s.ok()) { - ++recovered_edits; } } } - if (!log_read_status->ok()) { + if (s.ok() && !log_read_status->ok()) { s = *log_read_status; } @@ -746,9 +745,15 @@ Status VersionEditHandler::MaybeHandleFileBoundariesForNewFiles( } std::string smallest_buf; std::string largest_buf; + Slice largest_slice = meta.largest.Encode(); PadInternalKeyWithMinTimestamp(&smallest_buf, meta.smallest.Encode(), ts_sz); - PadInternalKeyWithMinTimestamp(&largest_buf, meta.largest.Encode(), ts_sz); + auto largest_footer = ExtractInternalKeyFooter(largest_slice); + if (largest_footer == kRangeTombstoneSentinel) { + PadInternalKeyWithMaxTimestamp(&largest_buf, largest_slice, ts_sz); + } else { + PadInternalKeyWithMinTimestamp(&largest_buf, largest_slice, ts_sz); + } meta.smallest.DecodeFrom(smallest_buf); meta.largest.DecodeFrom(largest_buf); } @@ -766,12 +771,80 @@ VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( read_options, epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { + for (const auto& cfid_and_version : atomic_update_versions_) { + delete cfid_and_version.second; + } for (const auto& elem : versions_) { delete elem.second; } versions_.clear(); } +Status VersionEditHandlerPointInTime::OnAtomicGroupReplayBegin() { + if (in_atomic_group_) { + return Status::Corruption("unexpected AtomicGroup start"); + } + + // The AtomicGroup that is about to begin may block column families in a valid + // state from saving any more updates. So we should save any valid states + // before proceeding. + for (const auto& cfid_and_builder : builders_) { + ColumnFamilyData* cfd = version_set_->GetColumnFamilySet()->GetColumnFamily( + cfid_and_builder.first); + assert(!cfd->IsDropped()); + assert(cfd->initialized()); + VersionEdit edit; + Status s = MaybeCreateVersion(edit, cfd, true /* force_create_version */); + if (!s.ok()) { + return s; + } + } + + // An old AtomicGroup is incomplete. Throw away the versions that failed to + // complete it. They must not be used for completing the upcoming + // AtomicGroup since they are too old. + for (auto& cfid_and_version : atomic_update_versions_) { + delete cfid_and_version.second; + } + + in_atomic_group_ = true; + // We lazily assume the column families that exist at this point are all + // involved in the AtomicGroup. Overestimating the scope of the AtomicGroup + // will sometimes cause less data to be recovered, which is fine for + // best-effort recovery. + atomic_update_versions_.clear(); + for (const auto& cfid_and_builder : builders_) { + atomic_update_versions_[cfid_and_builder.first] = nullptr; + } + atomic_update_versions_missing_ = atomic_update_versions_.size(); + return Status::OK(); +} + +Status VersionEditHandlerPointInTime::OnAtomicGroupReplayEnd() { + if (!in_atomic_group_) { + return Status::Corruption("unexpected AtomicGroup end"); + } + in_atomic_group_ = false; + + // The AtomicGroup must not have changed the column families. We don't support + // CF adds or drops in an AtomicGroup. + if (builders_.size() != atomic_update_versions_.size()) { + return Status::Corruption("unexpected CF change in AtomicGroup"); + } + for (const auto& cfid_and_builder : builders_) { + if (atomic_update_versions_.find(cfid_and_builder.first) == + atomic_update_versions_.end()) { + return Status::Corruption("unexpected CF add in AtomicGroup"); + } + } + for (const auto& cfid_and_version : atomic_update_versions_) { + if (builders_.find(cfid_and_version.first) == builders_.end()) { + return Status::Corruption("unexpected CF drop in AtomicGroup"); + } + } + return Status::OK(); +} + void VersionEditHandlerPointInTime::CheckIterationResult( const log::Reader& reader, Status* s) { VersionEditHandler::CheckIterationResult(reader, s); @@ -801,7 +874,14 @@ void VersionEditHandlerPointInTime::CheckIterationResult( ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup( const VersionEdit& edit) { ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit); - auto v_iter = versions_.find(edit.GetColumnFamily()); + uint32_t cfid = edit.GetColumnFamily(); + if (AtomicUpdateVersionsContains(cfid)) { + AtomicUpdateVersionsDropCf(cfid); + if (AtomicUpdateVersionsCompleted()) { + AtomicUpdateVersionsApply(); + } + } + auto v_iter = versions_.find(cfid); if (v_iter != versions_.end()) { delete v_iter->second; versions_.erase(v_iter); @@ -902,15 +982,16 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( // Create version before apply edit. The version will represent the state // before applying the version edit. - // A new version will created if: + // A new version will be created if: // 1) no error has occurred so far, and // 2) log_number_, next_file_number_ and last_sequence_ are known, and - // 3) any of the following: + // 3) not in an AtomicGroup + // 4) any of the following: // a) no missing file before, but will have missing file(s) after applying // this version edit. // b) no missing file after applying the version edit, and the caller // explicitly request that a new version be created. - if (s.ok() && !missing_info && + if (s.ok() && !missing_info && !in_atomic_group_ && ((has_missing_files && !prev_has_missing_files) || (!has_missing_files && force_create_version))) { if (!builder) { @@ -939,15 +1020,22 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( } s = builder->SaveTo(version->storage_info()); if (s.ok()) { - version->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), read_options_, - !version_set_->db_options_->skip_stats_update_on_db_open); - auto v_iter = versions_.find(cfd->GetID()); - if (v_iter != versions_.end()) { - delete v_iter->second; - v_iter->second = version; + if (AtomicUpdateVersionsContains(cfd->GetID())) { + AtomicUpdateVersionsPut(version); + if (AtomicUpdateVersionsCompleted()) { + AtomicUpdateVersionsApply(); + } } else { - versions_.emplace(cfd->GetID(), version); + version->PrepareAppend( + *cfd->GetLatestMutableCFOptions(), read_options_, + !version_set_->db_options_->skip_stats_update_on_db_open); + auto v_iter = versions_.find(cfd->GetID()); + if (v_iter != versions_.end()) { + delete v_iter->second; + v_iter->second = version; + } else { + versions_.emplace(cfd->GetID(), version); + } } } else { delete version; @@ -987,6 +1075,60 @@ Status VersionEditHandlerPointInTime::LoadTables( return Status::OK(); } +bool VersionEditHandlerPointInTime::AtomicUpdateVersionsCompleted() { + return atomic_update_versions_missing_ == 0; +} + +bool VersionEditHandlerPointInTime::AtomicUpdateVersionsContains( + uint32_t cfid) { + return atomic_update_versions_.find(cfid) != atomic_update_versions_.end(); +} + +void VersionEditHandlerPointInTime::AtomicUpdateVersionsDropCf(uint32_t cfid) { + assert(!AtomicUpdateVersionsCompleted()); + auto atomic_update_versions_iter = atomic_update_versions_.find(cfid); + assert(atomic_update_versions_iter != atomic_update_versions_.end()); + if (atomic_update_versions_iter->second == nullptr) { + atomic_update_versions_missing_--; + } else { + delete atomic_update_versions_iter->second; + } + atomic_update_versions_.erase(atomic_update_versions_iter); +} + +void VersionEditHandlerPointInTime::AtomicUpdateVersionsPut(Version* version) { + assert(!AtomicUpdateVersionsCompleted()); + auto atomic_update_versions_iter = + atomic_update_versions_.find(version->cfd()->GetID()); + assert(atomic_update_versions_iter != atomic_update_versions_.end()); + if (atomic_update_versions_iter->second == nullptr) { + atomic_update_versions_missing_--; + } else { + delete atomic_update_versions_iter->second; + } + atomic_update_versions_iter->second = version; +} + +void VersionEditHandlerPointInTime::AtomicUpdateVersionsApply() { + assert(AtomicUpdateVersionsCompleted()); + for (const auto& cfid_and_version : atomic_update_versions_) { + uint32_t cfid = cfid_and_version.first; + Version* version = cfid_and_version.second; + assert(version != nullptr); + version->PrepareAppend( + *version->cfd()->GetLatestMutableCFOptions(), read_options_, + !version_set_->db_options_->skip_stats_update_on_db_open); + auto versions_iter = versions_.find(cfid); + if (versions_iter != versions_.end()) { + delete versions_iter->second; + versions_iter->second = version; + } else { + versions_.emplace(cfid, version); + } + } + atomic_update_versions_.clear(); +} + Status ManifestTailer::Initialize() { if (Mode::kRecovery == mode_) { return VersionEditHandler::Initialize(); diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index af0817e4a17..4caa9c08988 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -40,6 +40,9 @@ class VersionEditHandlerBase { virtual Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) = 0; + virtual Status OnAtomicGroupReplayBegin() { return Status::OK(); } + virtual Status OnAtomicGroupReplayEnd() { return Status::OK(); } + virtual void CheckIterationResult(const log::Reader& /*reader*/, Status* /*s*/) {} @@ -237,8 +240,16 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { ~VersionEditHandlerPointInTime() override; protected: + Status OnAtomicGroupReplayBegin() override; + Status OnAtomicGroupReplayEnd() override; void CheckIterationResult(const log::Reader& reader, Status* s) override; + ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; + // `MaybeCreateVersion(..., false)` creates a version upon a negative edge + // trigger (transition from valid to invalid). + // + // `MaybeCreateVersion(..., true)` creates a version on a positive level + // trigger (state is valid). Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) override; virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, @@ -251,6 +262,30 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { bool is_initial_load) override; std::unordered_map versions_; + + // `atomic_update_versions_` is for ensuring all-or-nothing AtomicGroup + // recoveries. When `atomic_update_versions_` is nonempty, it serves as a + // barrier to updating `versions_` until all its values are populated. + std::unordered_map atomic_update_versions_; + // `atomic_update_versions_missing_` counts the nullptr values in + // `atomic_update_versions_`. + size_t atomic_update_versions_missing_; + + bool in_atomic_group_ = false; + + private: + bool AtomicUpdateVersionsCompleted(); + bool AtomicUpdateVersionsContains(uint32_t cfid); + void AtomicUpdateVersionsDropCf(uint32_t cfid); + + // This function is called for `Version*` updates for column families in an + // incomplete atomic update. It buffers `Version*` updates in + // `atomic_update_versions_`. + void AtomicUpdateVersionsPut(Version* version); + + // This function is called upon completion of an atomic update. It applies + // `Version*` updates in `atomic_update_versions_` to `versions_`. + void AtomicUpdateVersionsApply(); }; class ManifestTailer : public VersionEditHandlerPointInTime { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index c7e90c12e98..d01e7b678c1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -563,7 +563,9 @@ TEST_F(VersionEditTest, AddWalDebug) { std::stringstream ss; ss << "{\"LogNumber\": " << kLogNumbers[i] << ", " << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}"; - if (i < n - 1) ss << ", "; + if (i < n - 1) { + ss << ", "; + } expected_json += ss.str(); } expected_json += "], \"ColumnFamily\": 0}"; diff --git a/db/version_set.cc b/db/version_set.cc index a4bea1a32e6..f3aacbc518c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -142,7 +142,9 @@ Status OverlapWithIterator(const Comparator* ucmp, ParsedInternalKey seek_result; Status s = ParseInternalKey(iter->key(), &seek_result, false /* log_err_key */); // TODO - if (!s.ok()) return s; + if (!s.ok()) { + return s; + } if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= 0) { @@ -609,12 +611,26 @@ class FilePickerMultiGet { FdWithKeyRange* f = nullptr; bool file_hit = false; int cmp_largest = -1; + int cmp_smallest = -1; if (curr_file_index >= curr_file_level_->num_files) { // In the unlikely case the next key is a duplicate of the current key, // and the current key is the last in the level and the internal key // was not found, we need to skip lookup for the remaining keys and // reset the search bounds if (batch_iter_ != current_level_range_.end()) { +#ifndef NDEBUG + if (curr_level_ < num_levels_ + 1) { + if ((*level_files_brief_)[curr_level_].num_files == 0) { + struct FilePickerContext& fp_ctx = + fp_ctx_array_[batch_iter_.index()]; + + assert(fp_ctx.search_left_bound == 0); + assert(fp_ctx.search_right_bound == -1 || + fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex); + } + } +#endif // NDBEUG + ++batch_iter_; for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; @@ -647,7 +663,7 @@ class FilePickerMultiGet { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. - int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + cmp_smallest = user_comparator_->CompareWithoutTimestamp( user_key, false, ExtractUserKey(f->smallest_key), true); assert(curr_level_ == 0 || @@ -693,6 +709,12 @@ class FilePickerMultiGet { user_comparator_->CompareWithoutTimestamp( batch_iter_->ukey_without_ts, false, upper_key_->ukey_without_ts, false) == 0) { + if (curr_level_ > 0) { + struct FilePickerContext& ctx = fp_ctx_array_[upper_key_.index()]; + file_indexer_->GetNextLevelIndex( + curr_level_, ctx.curr_index_in_curr_level, cmp_smallest, + cmp_largest, &ctx.search_left_bound, &ctx.search_right_bound); + } ++upper_key_; } break; @@ -821,6 +843,8 @@ class FilePickerMultiGet { continue; } } + assert(start_index >= 0); + assert(start_index < static_cast(curr_file_level_->num_files)); fp_ctx.start_index_in_curr_level = start_index; fp_ctx.curr_index_in_curr_level = start_index; } @@ -855,9 +879,9 @@ Version::~Version() { assert(cfd_ != nullptr); uint32_t path_id = f->fd.GetPathId(); assert(path_id < cfd_->ioptions()->cf_paths.size()); - vset_->obsolete_files_.push_back( - ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path, - cfd_->GetFileMetadataCacheReservationManager())); + vset_->obsolete_files_.emplace_back( + f, cfd_->ioptions()->cf_paths[path_id].path, + cfd_->GetFileMetadataCacheReservationManager()); } } } @@ -1641,7 +1665,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::stringstream ss; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; for (int level = 0; level < storage_info_.num_levels_; level++) { for (const auto& file_meta : storage_info_.files_[level]) { @@ -1713,13 +1737,13 @@ Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, } Status Version::GetPropertiesOfTablesInRange( - const ReadOptions& read_options, const Range* range, std::size_t n, + const ReadOptions& read_options, const autovector& ranges, TablePropertiesCollection* props) const { for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { - for (decltype(n) i = 0; i < n; i++) { + for (const auto& range : ranges) { // Convert user_key into a corresponding internal key. - InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek); std::vector files; storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr, false); @@ -2104,7 +2128,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, BeforeFile(ucmp, &largest_user_key, file)) { continue; } - ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( + ScopedArenaPtr iter(cfd_->table_cache()->NewIterator( read_options, file_options, cfd_->internal_comparator(), *file->file_metadata, &range_del_agg, mutable_cf_options_.prefix_extractor, nullptr, @@ -2123,7 +2147,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, } } else if (storage_info_.LevelFilesBrief(level).num_files > 0) { auto mem = arena.AllocateAligned(sizeof(LevelIterator)); - ScopedArenaIterator iter(new (mem) LevelIterator( + ScopedArenaPtr iter(new (mem) LevelIterator( cfd_->table_cache(), read_options, file_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor, should_sample_file_read(), @@ -2559,8 +2583,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, MergeHelper::kNoBaseValue, merge_context->GetOperands(), info_log_, db_statistics_, clock_, - /* update_num_ops_stats */ true, value ? value->GetSelf() : nullptr, - columns, /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + value ? value->GetSelf() : nullptr, columns); if (status->ok()) { if (LIKELY(value != nullptr)) { value->PinSelf(); @@ -2825,9 +2849,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, MergeHelper::kNoBaseValue, iter->merge_context.GetOperands(), info_log_, db_statistics_, clock_, - /* update_num_ops_stats */ true, - iter->value ? iter->value->GetSelf() : nullptr, iter->columns, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + iter->value ? iter->value->GetSelf() : nullptr, iter->columns); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); range->AddValueSize(iter->value->size()); @@ -3127,9 +3150,7 @@ void VersionStorageInfo::PrepareForVersionAppend( GenerateFileIndexer(); GenerateLevelFilesBrief(); GenerateLevel0NonOverlapping(); - if (!immutable_options.allow_ingest_behind) { - GenerateBottommostFiles(); - } + GenerateBottommostFiles(); GenerateFileLocationIndex(); } @@ -3138,7 +3159,7 @@ void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options, bool update_stats) { TEST_SYNC_POINT_CALLBACK( "Version::PrepareAppend:forced_check", - reinterpret_cast(&storage_info_.force_consistency_checks_)); + static_cast(&storage_info_.force_consistency_checks_)); if (update_stats) { UpdateAccumulatedStats(read_options); @@ -3162,7 +3183,9 @@ bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options, file_meta->fd.GetNumber(), s.ToString().c_str()); return false; } - if (tp.get() == nullptr) return false; + if (tp.get() == nullptr) { + return false; + } file_meta->num_entries = tp->num_entries; file_meta->num_deletions = tp->num_deletions; file_meta->raw_value_size = tp->raw_value_size; @@ -4035,13 +4058,19 @@ void SortFileByOverlappingRatio( // This makes the algorithm more deterministic, and also // help the trivial move case to have more files to // extend. - if (file_to_order[f1.file->fd.GetNumber()] == - file_to_order[f2.file->fd.GetNumber()]) { - return icmp.Compare(f1.file->smallest, - f2.file->smallest) < 0; + if (f1.file->marked_for_compaction == + f2.file->marked_for_compaction) { + if (file_to_order[f1.file->fd.GetNumber()] == + file_to_order[f2.file->fd.GetNumber()]) { + return icmp.Compare(f1.file->smallest, + f2.file->smallest) < 0; + } + return file_to_order[f1.file->fd.GetNumber()] < + file_to_order[f2.file->fd.GetNumber()]; + } else { + return f1.file->marked_for_compaction > + f2.file->marked_for_compaction; } - return file_to_order[f1.file->fd.GetNumber()] < - file_to_order[f2.file->fd.GetNumber()]; }); } @@ -4577,15 +4606,19 @@ const char* VersionStorageInfo::LevelSummary( for (int i = 0; i < num_levels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); - if (ret < 0 || ret >= sz) break; + if (ret < 0 || ret >= sz) { + break; + } len += ret; } if (len > 0) { // overwrite the last space --len; } - len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, - "] max score %.2f", compaction_score_[0]); + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "] max score %.2f, estimated pending compaction bytes %" PRIu64, + compaction_score_[0], estimated_compaction_needed_bytes_); if (!files_marked_for_compaction_.empty()) { snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, @@ -4607,7 +4640,9 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, static_cast(f->being_compacted)); - if (ret < 0 || ret >= sz) break; + if (ret < 0 || ret >= sz) { + break; + } len += ret; } // overwrite the last space (only if files_[level].size() is non-zero) @@ -5124,7 +5159,7 @@ VersionSet::VersionSet( BlockCacheTracer* const block_cache_tracer, const std::shared_ptr& io_tracer, const std::string& db_id, const std::string& db_session_id, const std::string& daily_offpeak_time_utc, - ErrorHandler* const error_handler) + ErrorHandler* const error_handler, const bool read_only) : column_family_set_(new ColumnFamilySet( dbname, _db_options, storage_options, table_cache, write_buffer_manager, write_controller, block_cache_tracer, io_tracer, @@ -5151,7 +5186,59 @@ VersionSet::VersionSet( io_tracer_(io_tracer), db_session_id_(db_session_id), offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)), - error_handler_(error_handler) {} + error_handler_(error_handler), + read_only_(read_only), + closed_(false) {} + +Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) { + Status s; + if (closed_ || read_only_ || !manifest_file_number_ || !descriptor_log_) { + return s; + } + + std::string manifest_file_name = + DescriptorFileName(dbname_, manifest_file_number_); + uint64_t size = 0; + IOStatus io_s = descriptor_log_->Close(WriteOptions()); + descriptor_log_.reset(); + TEST_SYNC_POINT("VersionSet::Close:AfterClose"); + if (io_s.ok()) { + io_s = fs_->GetFileSize(manifest_file_name, IOOptions(), &size, nullptr); + } + if (!io_s.ok() || size != manifest_file_size_) { + if (io_s.ok()) { + // This means the size is not as expected. So we treat it as a + // corruption and set io_s appropriately + io_s = IOStatus::Corruption(); + } + ColumnFamilyData* cfd = GetColumnFamilySet()->GetDefault(); + const ImmutableOptions* ioptions = cfd->ioptions(); + IOErrorInfo io_error_info(io_s, FileOperationType::kVerify, + manifest_file_name, /*length=*/size, + /*offset=*/0); + + for (auto& listener : ioptions->listeners) { + listener->OnIOError(io_error_info); + } + io_s.PermitUncheckedError(); + io_error_info.io_status.PermitUncheckedError(); + ROCKS_LOG_ERROR(db_options_->info_log, + "MANIFEST verification on Close, " + "filename %s, expected size %" PRIu64 + " failed with status %s and " + "actual size %" PRIu64 "\n", + manifest_file_name.c_str(), manifest_file_size_, + io_s.ToString().c_str(), size); + VersionEdit edit; + assert(cfd); + const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); + s = LogAndApply(cfd, cf_opts, ReadOptions(), WriteOptions(), &edit, mu, + db_dir); + } + + closed_ = true; + return s; +} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on @@ -5232,8 +5319,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options, - const ReadOptions& read_options) { + const ColumnFamilyOptions* new_cf_options, const ReadOptions& read_options, + const WriteOptions& write_options) { mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); @@ -5531,9 +5618,9 @@ Status VersionSet::ProcessManifestWrites( for (const auto* cfd : *column_family_set_) { assert(curr_state.find(cfd->GetID()) == curr_state.end()); - curr_state.emplace(std::make_pair( + curr_state.emplace( cfd->GetID(), - MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow()))); + MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())); } for (const auto& wal : wals_.GetWals()) { @@ -5595,13 +5682,15 @@ Status VersionSet::ProcessManifestWrites( FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, - io_tracer_, nullptr, db_options_->listeners, nullptr, + io_tracer_, nullptr, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + db_options_->listeners, nullptr, tmp_set.Contains(FileType::kDescriptorFile), tmp_set.Contains(FileType::kDescriptorFile))); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(curr_state, wal_additions, - descriptor_log_.get(), io_s); + s = WriteCurrentStateToManifest(write_options, curr_state, + wal_additions, descriptor_log_.get(), + io_s); } else { manifest_io_status = io_s; s = io_s; @@ -5647,7 +5736,7 @@ Status VersionSet::ProcessManifestWrites( } ++idx; #endif /* !NDEBUG */ - io_s = descriptor_log_->AddRecord(record); + io_s = descriptor_log_->AddRecord(write_options, record); if (!io_s.ok()) { s = io_s; manifest_io_status = io_s; @@ -5667,7 +5756,7 @@ Status VersionSet::ProcessManifestWrites( if (s.ok()) { if (!db_options_->disable_manifest_sync) { - io_s = SyncManifest(db_options_, descriptor_log_->file()); + io_s = SyncManifest(db_options_, write_options, descriptor_log_->file()); manifest_io_status = io_s; } TEST_SYNC_POINT_CALLBACK( @@ -5686,7 +5775,8 @@ Status VersionSet::ProcessManifestWrites( assert(manifest_io_status.ok()); } if (s.ok() && new_descriptor_log) { - io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_, + io_s = SetCurrentFile(write_options, fs_.get(), dbname_, + pending_manifest_file_number_, dir_contains_current_file); if (!io_s.ok()) { s = io_s; @@ -5944,7 +6034,7 @@ void VersionSet::WakeUpWaitingManifestWriters() { Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, - const ReadOptions& read_options, + const ReadOptions& read_options, const WriteOptions& write_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, @@ -6022,8 +6112,8 @@ Status VersionSet::LogAndApply( return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, dir_contains_current_file, - new_descriptor_log, new_cf_options, - read_options); + new_descriptor_log, new_cf_options, read_options, + write_options); } void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, @@ -6363,7 +6453,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, Status VersionSet::ListColumnFamiliesFromManifest( const std::string& manifest_path, FileSystem* fs, std::vector* column_families) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr file_reader; Status s; @@ -6407,8 +6497,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, "Number of levels needs to be bigger than 1"); } - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; ImmutableDBOptions db_options(*options); ColumnFamilyOptions cf_options(*options); @@ -6420,7 +6511,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, /*db_id*/ "", /*db_session_id*/ "", options->daily_offpeak_time_utc, - /*error_handler_*/ nullptr); + /*error_handler_*/ nullptr, /*read_only=*/false); Status status; std::vector dummy; @@ -6498,8 +6589,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, InstrumentedMutex dummy_mutex; InstrumentedMutexLock l(&dummy_mutex); return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, read_options, &ve, - &dummy_mutex, nullptr, true); + mutable_cf_options, read_options, write_options, + &ve, &dummy_mutex, nullptr, true); } // Get the checksum information including the checksum and checksum function @@ -6573,7 +6664,7 @@ Status VersionSet::DumpManifest( Options& options, std::string& dscname, bool verbose, bool hex, bool json, const std::vector& cf_descs) { assert(options.env); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::vector column_families; @@ -6640,6 +6731,7 @@ void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { } Status VersionSet::WriteCurrentStateToManifest( + const WriteOptions& write_options, const std::unordered_map& curr_state, const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { // TODO: Break up into multiple records to reduce memory usage on recovery? @@ -6660,7 +6752,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit:" + edit_for_db_id.DebugString(true)); } - io_s = log->AddRecord(db_id_record); + io_s = log->AddRecord(write_options, db_id_record); if (!io_s.ok()) { return io_s; } @@ -6675,7 +6767,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit: " + wal_additions.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } @@ -6692,7 +6784,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit: " + wal_deletions.DebugString(true)); } - io_s = log->AddRecord(wal_deletions_record); + io_s = log->AddRecord(write_options, wal_deletions_record); if (!io_s.ok()) { return io_s; } @@ -6722,7 +6814,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit:" + edit.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } @@ -6804,7 +6896,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit:" + edit.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } @@ -6821,7 +6913,7 @@ Status VersionSet::WriteCurrentStateToManifest( return Status::Corruption("Unable to Encode VersionEdit: " + replication_epoch_additions.DebugString(true)); } - io_s = log->AddRecord(record); + io_s = log->AddRecord(write_options, record); if (!io_s.ok()) { return io_s; } @@ -7478,9 +7570,9 @@ ReactiveVersionSet::ReactiveVersionSet( write_buffer_manager, write_controller, /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "", /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "", - /*error_handler=*/nullptr) {} + /*error_handler=*/nullptr, /*read_only=*/true) {} -ReactiveVersionSet::~ReactiveVersionSet() {} +ReactiveVersionSet::~ReactiveVersionSet() = default; Status ReactiveVersionSet::Recover( const std::vector& column_families, diff --git a/db/version_set.h b/db/version_set.h index de4995d9abb..701cfdd7c82 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -99,8 +99,8 @@ using VersionEditParams = VersionEdit; // Return file_level.num_files if there is no such file. // REQUIRES: "file_level.files" contains a sorted list of // non-overlapping files. -extern int FindFile(const InternalKeyComparator& icmp, - const LevelFilesBrief& file_level, const Slice& key); +int FindFile(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const Slice& key); // Returns true iff some file in "files" overlaps the user key range // [*smallest,*largest]. @@ -108,18 +108,18 @@ extern int FindFile(const InternalKeyComparator& icmp, // largest==nullptr represents a key largest than all keys in the DB. // REQUIRES: If disjoint_sorted_files, file_level.files[] // contains disjoint ranges in sorted order. -extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, - bool disjoint_sorted_files, - const LevelFilesBrief& file_level, - const Slice* smallest_user_key, - const Slice* largest_user_key); +bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const LevelFilesBrief& file_level, + const Slice* smallest_user_key, + const Slice* largest_user_key); // Generate LevelFilesBrief from vector // Would copy smallest_key and largest_key data to sequential memory // arena: Arena used to allocate the memory -extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, - const std::vector& files, - Arena* arena); +void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, + const std::vector& files, + Arena* arena); enum EpochNumberRequirement { kMightMissing, kMustPresent, @@ -491,6 +491,12 @@ class VersionStorageInfo { files_marked_for_periodic_compaction_.emplace_back(level, f); } + // REQUIRES: PrepareForVersionAppend has been called + const autovector>& BottommostFiles() const { + assert(finalized_); + return bottommost_files_; + } + // REQUIRES: ComputeCompactionScore has been called // REQUIRES: DB mutex held during access const autovector>& @@ -591,7 +597,9 @@ class VersionStorageInfo { return estimated_compaction_needed_bytes_; } - void TEST_set_estimated_compaction_needed_bytes(uint64_t v) { + void TEST_set_estimated_compaction_needed_bytes(uint64_t v, + InstrumentedMutex* mu) { + InstrumentedMutexLock l(mu); estimated_compaction_needed_bytes_ = v; } @@ -973,7 +981,7 @@ class Version { Status GetPropertiesOfAllTables(const ReadOptions& read_options, TablePropertiesCollection* props, int level); Status GetPropertiesOfTablesInRange(const ReadOptions& read_options, - const Range* range, std::size_t n, + const autovector& ranges, TablePropertiesCollection* props) const; // Print summary of range delete tombstones in SST files into out_str, @@ -1156,22 +1164,25 @@ class VersionSet { const std::shared_ptr& io_tracer, const std::string& db_id, const std::string& db_session_id, const std::string& daily_offpeak_time_utc, - ErrorHandler* const error_handler); + ErrorHandler* const error_handler, const bool read_only); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; virtual ~VersionSet(); + virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu); + Status LogAndApplyToDefaultColumnFamily( - const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, const WriteOptions& write_options, + VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); const MutableCFOptions* cf_options = default_cf->GetLatestMutableCFOptions(); - return LogAndApply(default_cf, *cf_options, read_options, edit, mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(default_cf, *cf_options, read_options, write_options, + edit, mu, dir_contains_current_file, new_descriptor_log, column_family_options); } @@ -1184,7 +1195,8 @@ class VersionSet { Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, - const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, const WriteOptions& write_options, + VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, const std::function& manifest_wcb = {}) { @@ -1196,16 +1208,17 @@ class VersionSet { autovector edit_list; edit_list.emplace_back(edit); edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, - mu, dir_contains_current_file, new_descriptor_log, - column_family_options, {manifest_wcb}); + return LogAndApply(cfds, mutable_cf_options_list, read_options, + write_options, edit_lists, mu, dir_contains_current_file, + new_descriptor_log, column_family_options, + {manifest_wcb}); } // The batch version. If edit_list.size() > 1, caller must ensure that // no edit in the list column family add or drop Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, - const ReadOptions& read_options, + const ReadOptions& read_options, const WriteOptions& write_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, @@ -1216,9 +1229,10 @@ class VersionSet { mutable_cf_options_list.emplace_back(&mutable_cf_options); autovector> edit_lists; edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, - mu, dir_contains_current_file, new_descriptor_log, - column_family_options, {manifest_wcb}); + return LogAndApply(cfds, mutable_cf_options_list, read_options, + write_options, edit_lists, mu, dir_contains_current_file, + new_descriptor_log, column_family_options, + {manifest_wcb}); } // The across-multi-cf batch version. If edit_lists contain more than @@ -1227,7 +1241,7 @@ class VersionSet { virtual Status LogAndApply( const autovector& cfds, const autovector& mutable_cf_options_list, - const ReadOptions& read_options, + const ReadOptions& read_options, const WriteOptions& write_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, @@ -1570,6 +1584,7 @@ class VersionSet { new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); constexpr bool update_stats = false; + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; version->PrepareAppend(mutable_cf_options, read_options, update_stats); AppendVersion(cfd, version); @@ -1589,7 +1604,7 @@ class VersionSet { struct LogReporter : public log::Reader::Reporter { Status* status; - virtual void Corruption(size_t /*bytes*/, const Status& s) override { + void Corruption(size_t /*bytes*/, const Status& s) override { if (status->ok()) { *status = s; } @@ -1620,6 +1635,7 @@ class VersionSet { // Save current contents to *log Status WriteCurrentStateToManifest( + const WriteOptions& write_options, const std::unordered_map& curr_state, const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); @@ -1736,13 +1752,17 @@ class VersionSet { FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, - const ReadOptions& read_options); + const ReadOptions& read_options, + const WriteOptions& write_options); void LogAndApplyCFHelper(VersionEdit* edit, SequenceNumber* max_last_sequence); Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, VersionEdit* edit, SequenceNumber* max_last_sequence, InstrumentedMutex* mu); + + const bool read_only_; + bool closed_; }; // ReactiveVersionSet represents a collection of versions of the column @@ -1760,6 +1780,10 @@ class ReactiveVersionSet : public VersionSet { ~ReactiveVersionSet() override; + Status Close(FSDirectory* /*db_dir*/, InstrumentedMutex* /*mu*/) override { + return Status::OK(); + } + Status ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, @@ -1788,7 +1812,7 @@ class ReactiveVersionSet : public VersionSet { private: std::unique_ptr manifest_tailer_; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options_; using VersionSet::LogAndApply; using VersionSet::Recover; @@ -1797,6 +1821,7 @@ class ReactiveVersionSet : public VersionSet { const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, const ReadOptions& /* read_options */, + const WriteOptions& /* write_options */, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 5eb910c9f32..f6a983d6b21 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -34,7 +34,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { LevelFilesBrief file_level_; Arena arena_; - GenerateLevelFilesBriefTest() {} + GenerateLevelFilesBriefTest() = default; ~GenerateLevelFilesBriefTest() override { for (size_t i = 0; i < files_.size(); i++) { @@ -213,7 +213,7 @@ class VersionStorageInfoTest : public VersionStorageInfoTestBase { public: VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {} - ~VersionStorageInfoTest() override {} + ~VersionStorageInfoTest() override = default; }; TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) { @@ -520,6 +520,55 @@ TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); } +TEST_F(VersionStorageInfoTest, SingleLevelBottommostData) { + // In case of a single level, the oldest L0 file is bottommost. This could be + // improved in case the L0 files cover disjoint key-ranges. + Add(0 /* level */, 1U /* file_number */, "A" /* smallest */, + "Z" /* largest */, 1U /* file_size */); + Add(0 /* level */, 2U /* file_number */, "A" /* smallest */, + "Z" /* largest */, 1U /* file_size */); + Add(0 /* level */, 3U /* file_number */, "0" /* smallest */, + "9" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(1, vstorage_.BottommostFiles().size()); + ASSERT_EQ(0, vstorage_.BottommostFiles()[0].first); + ASSERT_EQ(3U, vstorage_.BottommostFiles()[0].second->fd.GetNumber()); +} + +TEST_F(VersionStorageInfoTest, MultiLevelBottommostData) { + // In case of multiple levels, the oldest file for a key-range from each L1+ + // level is bottommost. This could be improved in case an L0 file contains the + // oldest data for some range of keys. + Add(0 /* level */, 1U /* file_number */, "A" /* smallest */, + "Z" /* largest */, 1U /* file_size */); + Add(0 /* level */, 2U /* file_number */, "0" /* smallest */, + "9" /* largest */, 1U /* file_size */); + Add(1 /* level */, 3U /* file_number */, "A" /* smallest */, + "D" /* largest */, 1U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "E" /* smallest */, + "H" /* largest */, 1U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "I" /* smallest */, + "L" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + autovector> bottommost_files = + vstorage_.BottommostFiles(); + std::sort(bottommost_files.begin(), bottommost_files.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + assert(lhs.second); + assert(rhs.second); + return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); + }); + ASSERT_EQ(3, bottommost_files.size()); + ASSERT_EQ(3U, bottommost_files[0].second->fd.GetNumber()); + ASSERT_EQ(4U, bottommost_files[1].second->fd.GetNumber()); + ASSERT_EQ(5U, bottommost_files[2].second->fd.GetNumber()); +} + TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { // Two files that overlap at the range deletion tombstone sentinel. Add(1, 1U, {"a", 0, kTypeValue}, @@ -928,7 +977,7 @@ class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { VersionStorageInfoTimestampTest() : VersionStorageInfoTestBase(test::BytewiseComparatorWithU64TsWrapper()) { } - ~VersionStorageInfoTimestampTest() override {} + ~VersionStorageInfoTimestampTest() override = default; std::string Timestamp(uint64_t ts) const { std::string ret; PutFixed64(&ret, ts); @@ -982,7 +1031,7 @@ class FindLevelFileTest : public testing::Test { FindLevelFileTest() : disjoint_sorted_files_(true) {} - ~FindLevelFileTest() override {} + ~FindLevelFileTest() override = default; void LevelFileInit(size_t num = 0) { char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange)); @@ -1170,6 +1219,7 @@ class VersionSetTestBase { const static std::string kColumnFamilyName1; const static std::string kColumnFamilyName2; const static std::string kColumnFamilyName3; + const static int kNumColumnFamilies = 4; int num_initial_edits_; explicit VersionSetTestBase(const std::string& name) @@ -1183,7 +1233,7 @@ class VersionSetTestBase { table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), - mock_table_factory_(std::make_shared()) { + table_factory_(std::make_shared()) { EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); if (env_ == Env::Default() && getenv("MEM_ENV")) { env_guard_.reset(NewMemEnv(Env::Default())); @@ -1206,7 +1256,7 @@ class VersionSetTestBase { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); reactive_versions_ = std::make_shared( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, nullptr); @@ -1273,31 +1323,93 @@ class VersionSetTestBase { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; new_db.EncodeTo(&record); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); for (const auto& e : new_cfs) { record.clear(); e.EncodeTo(&record); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } } ASSERT_OK(s); - cf_options_.table_factory = mock_table_factory_; + cf_options_.table_factory = table_factory_; for (const auto& cf_name : cf_names) { column_families->emplace_back(cf_name, cf_options_); } } + struct SstInfo { + uint64_t file_number; + std::string column_family; + std::string key; // the only key + int level = 0; + uint64_t epoch_number; + SstInfo(uint64_t file_num, const std::string& cf_name, + const std::string& _key, + uint64_t _epoch_number = kUnknownEpochNumber) + : SstInfo(file_num, cf_name, _key, 0, _epoch_number) {} + SstInfo(uint64_t file_num, const std::string& cf_name, + const std::string& _key, int lvl, + uint64_t _epoch_number = kUnknownEpochNumber) + : file_number(file_num), + column_family(cf_name), + key(_key), + level(lvl), + epoch_number(_epoch_number) {} + }; + + // Create dummy sst, return their metadata. Note that only file name and size + // are used. + void CreateDummyTableFiles(const std::vector& file_infos, + std::vector* file_metas) { + assert(file_metas != nullptr); + for (const auto& info : file_infos) { + uint64_t file_num = info.file_number; + std::string fname = MakeTableFileName(dbname_, file_num); + std::unique_ptr file; + Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr fwriter(new WritableFileWriter( + std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); + InternalTblPropCollFactories internal_tbl_prop_coll_factories; + + const ReadOptions read_options; + const WriteOptions write_options; + std::unique_ptr builder(table_factory_->NewTableBuilder( + TableBuilderOptions( + immutable_options_, mutable_cf_options_, read_options, + write_options, InternalKeyComparator(options_.comparator), + &internal_tbl_prop_coll_factories, kNoCompression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + info.column_family, info.level), + fwriter.get())); + InternalKey ikey(info.key, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), "value"); + ASSERT_OK(builder->Finish()); + ASSERT_OK(fwriter->Flush(IOOptions())); + uint64_t file_size = 0; + s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); + ASSERT_OK(s); + ASSERT_NE(0, file_size); + file_metas->emplace_back( + file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, + Temperature::kUnknown, 0, 0, 0, info.epoch_number, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, + 0, 0, /* user_defined_timestamps_persisted */ true); + } + } + // Create DB with 3 column families. void NewDB() { SequenceNumber last_seqno; std::unique_ptr log_writer; - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); EXPECT_OK(versions_->Recover(column_families_, false)); @@ -1305,16 +1417,31 @@ class VersionSetTestBase { versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); } + void CloseDB() { + mutex_.Lock(); + versions_->Close(nullptr, &mutex_).PermitUncheckedError(); + versions_.reset(); + mutex_.Unlock(); + } + void ReopenDB() { versions_.reset(new VersionSet( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); EXPECT_OK(versions_->Recover(column_families_, false)); } + void GetManifestPath(std::string* manifest_path) const { + assert(manifest_path != nullptr); + uint64_t manifest_file_number = 0; + Status s = versions_->GetCurrentManifestPath( + dbname_, fs_.get(), manifest_path, &manifest_file_number); + ASSERT_OK(s); + } + void VerifyManifest(std::string* manifest_path) const { assert(manifest_path != nullptr); uint64_t manifest_file_number = 0; @@ -1328,7 +1455,7 @@ class VersionSetTestBase { mutex_.Lock(); Status s = versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &edit, &mutex_, nullptr); + read_options_, write_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1342,7 +1469,7 @@ class VersionSetTestBase { mutex_.Lock(); Status s = versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, vedits, &mutex_, nullptr); + read_options_, write_options_, vedits, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1354,7 +1481,8 @@ class VersionSetTestBase { VersionEdit dummy; ASSERT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &dummy, &mutex_, db_directory, new_descriptor_log)); + read_options_, write_options_, &dummy, &mutex_, db_directory, + new_descriptor_log)); mutex_.Unlock(); } @@ -1372,7 +1500,7 @@ class VersionSetTestBase { mutex_.Lock(); s = versions_->LogAndApply(/*column_family_data=*/nullptr, MutableCFOptions(cf_options), read_options_, - &new_cf, &mutex_, + write_options_, &new_cf, &mutex_, /*db_directory=*/nullptr, /*new_descriptor_log=*/false, &cf_options); mutex_.Unlock(); @@ -1395,6 +1523,8 @@ class VersionSetTestBase { ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; const ReadOptions read_options_; + const WriteOptions write_options_; + std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1402,7 +1532,7 @@ class VersionSetTestBase { std::shared_ptr reactive_versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; - std::shared_ptr mock_table_factory_; + std::shared_ptr table_factory_; std::vector column_families_; }; @@ -1419,6 +1549,7 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { NewDB(); const int kGroupSize = 5; const ReadOptions read_options; + const WriteOptions write_options; autovector edits; for (int i = 0; i != kGroupSize; ++i) { @@ -1440,14 +1571,15 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { int count = 0; SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) { - uint32_t* cf_id = reinterpret_cast(arg); + uint32_t* cf_id = static_cast(arg); EXPECT_EQ(0u, *cf_id); ++count; }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, - edit_lists, &mutex_, nullptr); + Status s = + versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, + write_options, edit_lists, &mutex_, nullptr); mutex_.Unlock(); EXPECT_OK(s); EXPECT_EQ(kGroupSize - 1, count); @@ -1649,7 +1781,7 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { mutex_.Lock(); Status s = versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - read_options_, &edit, &mutex_, nullptr); + read_options_, write_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -1716,7 +1848,7 @@ TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) { autovector versions; SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:NewVersion", - [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + [&](void* arg) { versions.push_back(static_cast(arg)); }); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(LogAndApplyToDefaultCF(edits)); @@ -1752,7 +1884,7 @@ TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) { autovector versions; SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:NewVersion", - [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + [&](void* arg) { versions.push_back(static_cast(arg)); }); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(LogAndApplyToDefaultCF(edits)); @@ -1824,7 +1956,7 @@ TEST_F(VersionSetTest, WalAddition) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -1892,7 +2024,7 @@ TEST_F(VersionSetTest, WalCloseWithoutSync) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 2); @@ -1946,7 +2078,7 @@ TEST_F(VersionSetTest, WalDeletion) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -1960,7 +2092,7 @@ TEST_F(VersionSetTest, WalDeletion) { std::vector wal_additions; SyncPoint::GetInstance()->SetCallBack( "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) { - VersionEdit* edit = reinterpret_cast(arg); + VersionEdit* edit = static_cast(arg); ASSERT_TRUE(edit->IsWalAddition()); for (auto& addition : edit->GetWalAdditions()) { wal_additions.push_back(addition); @@ -1985,7 +2117,7 @@ TEST_F(VersionSetTest, WalDeletion) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2106,7 +2238,7 @@ TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2143,7 +2275,7 @@ TEST_F(VersionSetTest, DeleteAllWals) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 0); @@ -2186,7 +2318,7 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); std::string db_id; ASSERT_OK( new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); @@ -2294,6 +2426,28 @@ TEST_F(VersionSetTest, OffpeakTimeInfoTest) { versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); } +TEST_F(VersionSetTest, ManifestTruncateAfterClose) { + std::string manifest_path; + VersionEdit edit; + + NewDB(); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Close:AfterClose", [&](void*) { + GetManifestPath(&manifest_path); + std::unique_ptr manifest_file; + EXPECT_OK(env_->ReopenWritableFile(manifest_path, &manifest_file, + EnvOptions())); + EXPECT_OK(manifest_file->Truncate(0)); + EXPECT_OK(manifest_file->Close()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + CloseDB(); + SyncPoint::GetInstance()->DisableProcessing(); + + ReopenDB(); +} + TEST_F(VersionStorageInfoTest, AddRangeDeletionCompensatedFileSize) { // Tests that compensated range deletion size is added to compensated file // size. @@ -2345,7 +2499,7 @@ class VersionSetWithTimestampTest : public VersionSetTest { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, /*db_id=*/nullptr)); for (auto* cfd : *(vset->GetColumnFamilySet())) { @@ -2368,7 +2522,8 @@ class VersionSetWithTimestampTest : public VersionSetTest { Status s; mutex_.Lock(); s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), - read_options_, edits_, &mutex_, nullptr); + read_options_, write_options_, edits_, &mutex_, + nullptr); mutex_.Unlock(); ASSERT_OK(s); VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); @@ -2414,6 +2569,9 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, VersionSetAtomicGroupTest() : VersionSetTestBase("version_set_atomic_group_test") {} + explicit VersionSetAtomicGroupTest(const std::string& name) + : VersionSetTestBase(name) {} + void SetUp() override { PrepareManifest(&column_families_, &last_seqno_, &log_writer_); SetupTestSyncPoints(); @@ -2428,7 +2586,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { @@ -2440,7 +2598,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupCorruptedAtomicGroup(int atomic_group_size) { @@ -2454,7 +2612,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupIncorrectAtomicGroup(int atomic_group_size) { @@ -2470,7 +2628,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr)); } void SetupTestSyncPoints() { @@ -2478,36 +2636,32 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); + VersionEdit* e = static_cast(arg); EXPECT_EQ(edits_.front().DebugString(), e->DebugString()); // compare based on value first_in_atomic_group_ = true; }); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); + VersionEdit* e = static_cast(arg); EXPECT_EQ(edits_.back().DebugString(), e->DebugString()); // compare based on value EXPECT_TRUE(first_in_atomic_group_); last_in_atomic_group_ = true; }); SyncPoint::GetInstance()->SetCallBack( - "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) { - num_recovered_edits_ = *reinterpret_cast(arg); - }); + "VersionEditHandlerBase::Iterate:Finish", + [&](void* arg) { num_recovered_edits_ = *static_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:AtomicGroup", [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", - [&](void* arg) { - corrupted_edit_ = *reinterpret_cast(arg); - }); + [&](void* arg) { corrupted_edit_ = *static_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", [&](void* arg) { - edit_with_incorrect_group_size_ = - *reinterpret_cast(arg); + edit_with_incorrect_group_size_ = *static_cast(arg); }); SyncPoint::GetInstance()->EnableProcessing(); } @@ -2515,8 +2669,8 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, void AddNewEditsToLog(int num_edits) { for (int i = 0; i < num_edits; i++) { std::string record; - edits_[i].EncodeTo(&record); - ASSERT_OK(log_writer_->AddRecord(record)); + edits_[i].EncodeTo(&record, 0 /* ts_sz */); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); } } @@ -2638,7 +2792,7 @@ TEST_F(VersionSetAtomicGroupTest, // edits. std::string last_record; edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); - EXPECT_OK(log_writer_->AddRecord(last_record)); + EXPECT_OK(log_writer_->AddRecord(WriteOptions(), last_record)); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); @@ -2784,6 +2938,425 @@ TEST_F(VersionSetAtomicGroupTest, edit_with_incorrect_group_size_.DebugString()); } +class AtomicGroupBestEffortRecoveryTest : public VersionSetAtomicGroupTest { + public: + AtomicGroupBestEffortRecoveryTest() + : VersionSetAtomicGroupTest("atomic_group_best_effort_recovery_test") {} +}; + +TEST_F(AtomicGroupBestEffortRecoveryTest, + HandleAtomicGroupUpdatesValidInitially) { + // One AtomicGroup contains updates that are valid at the outset. + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 - + cfid /* remaining_entries */); + } + AddNewEditsToLog(kNumColumnFamilies); + + { + bool has_missing_table_file; + ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_FALSE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_EQ(file_metas.size(), all_table_files.size()); +} + +TEST_F(AtomicGroupBestEffortRecoveryTest, HandleAtomicGroupUpdatesValidLater) { + // One AtomicGroup contains updates that become valid after applying further + // updates. + + // `SetupTestSyncPoints()` creates sync points that assume there is only one + // AtomicGroup, which is not the case in this test. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + if (cfid == kNumColumnFamilies - 1) { + // Corrupt the number of the last file. + file_metas[cfid].fd.packed_number_and_path_id = + PackFileNumberAndPathId(20 /* number */, 0 /* path_id */); + } + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 - + cfid /* remaining_entries */); + } + AddNewEditsToLog(kNumColumnFamilies); + + { + // Delete the file with the corrupted number. + VersionEdit fixup_edit; + fixup_edit.SetColumnFamily(kNumColumnFamilies - 1); + fixup_edit.DeleteFile(0 /* level */, 20 /* number */); + assert(log_writer_.get() != nullptr); + std::string record; + ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + + // Throw in an impossible AtomicGroup afterwards for extra challenge. + VersionEdit broken_edit; + broken_edit.SetColumnFamily(0 /* column_family_id */); + file_metas[0].fd.packed_number_and_path_id = + PackFileNumberAndPathId(30 /* number */, 0 /* path_id */); + broken_edit.AddFile(0 /* level */, file_metas[0]); + broken_edit.SetLastSequence(++last_seqno_); + broken_edit.MarkAtomicGroup(0 /* remaining_entries */); + record.clear(); + ASSERT_TRUE(broken_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + assert(log_writer_.get() != nullptr); + } + + { + bool has_missing_table_file = false; + ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_TRUE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_EQ(file_metas.size() - 1, all_table_files.size()); +} + +TEST_F(AtomicGroupBestEffortRecoveryTest, HandleAtomicGroupUpdatesInvalid) { + // One AtomicGroup contains updates that never become valid. + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + if (cfid == kNumColumnFamilies - 1) { + // Corrupt the number of the last file. + file_metas[cfid].fd.packed_number_and_path_id = + PackFileNumberAndPathId(20 /* number */, 0 /* path_id */); + } + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 - + cfid /* remaining_entries */); + } + AddNewEditsToLog(kNumColumnFamilies); + + { + bool has_missing_table_file = false; + ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_TRUE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_TRUE(all_table_files.empty()); +} + +TEST_F(AtomicGroupBestEffortRecoveryTest, + HandleAtomicGroupUpdatesValidTooLate) { + // One AtomicGroup contains updates that become valid after the next + // AtomicGroup is reached, which is too late. + + // `SetupTestSyncPoints()` creates sync points that assume there is only one + // AtomicGroup, which is not the case in this test. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + if (cfid == kNumColumnFamilies - 1) { + // Corrupt the number of the last file. + file_metas[cfid].fd.packed_number_and_path_id = + PackFileNumberAndPathId(20 /* number */, 0 /* path_id */); + } + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 - + cfid /* remaining_entries */); + } + AddNewEditsToLog(kNumColumnFamilies); + + { + // Delete the file with the corrupted number. But bundle it in an + // AtomicGroup with an update that can never be applied. + VersionEdit broken_edit; + broken_edit.SetColumnFamily(0 /* column_family_id */); + file_metas[0].fd.packed_number_and_path_id = + PackFileNumberAndPathId(30 /* number */, 0 /* path_id */); + broken_edit.AddFile(0 /* level */, file_metas[0]); + broken_edit.SetLastSequence(++last_seqno_); + broken_edit.MarkAtomicGroup(1 /* remaining_entries */); + std::string record; + ASSERT_TRUE(broken_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + + VersionEdit fixup_edit; + fixup_edit.SetColumnFamily(kNumColumnFamilies - 1); + fixup_edit.DeleteFile(0 /* level */, 20 /* number */); + fixup_edit.MarkAtomicGroup(0 /* remaining_entries */); + record.clear(); + ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + assert(log_writer_.get() != nullptr); + } + + { + bool has_missing_table_file = false; + ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_TRUE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_TRUE(all_table_files.empty()); +} + +TEST_F(AtomicGroupBestEffortRecoveryTest, + HandleAtomicGroupUpdatesInDuplicateInvalid) { + // One AtomicGroup has multiple updates for the same CF. One of the earlier + // updates for this CF can lead to a valid state if applied. But the last + // update for this CF is invalid so the AtomicGroup must not be recovered. + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - + cfid /* remaining_entries */); + } + // Here is the unrecoverable update. + edits_.emplace_back(); + edits_.back().SetColumnFamily(0 /* column_family_id */); + file_metas[0].fd.packed_number_and_path_id = + PackFileNumberAndPathId(20 /* number */, 0 /* path_id */); + edits_.back().AddFile(0 /* level */, file_metas[0]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(0 /* remaining_entries */); + AddNewEditsToLog(kNumColumnFamilies + 1); + + { + bool has_missing_table_file = false; + ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_TRUE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_TRUE(all_table_files.empty()); +} + +TEST_F(AtomicGroupBestEffortRecoveryTest, + HandleAtomicGroupMadeWholeByDeletingCf) { + // One AtomicGroup contains an update that becomes valid when its column + // family is deleted, making it irrelevant. + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + if (cfid == kNumColumnFamilies - 1) { + // Corrupt the number of the last file. + file_metas[cfid].fd.packed_number_and_path_id = + PackFileNumberAndPathId(20 /* number */, 0 /* path_id */); + } + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 - + cfid /* remaining_entries */); + } + AddNewEditsToLog(kNumColumnFamilies); + + { + // Delete the column family with the corrupted file number. + VersionEdit fixup_edit; + fixup_edit.DropColumnFamily(); + fixup_edit.SetColumnFamily(kNumColumnFamilies - 1); + assert(log_writer_.get() != nullptr); + std::string record; + ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + } + + { + bool has_missing_table_file = false; + ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_FALSE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_EQ(file_metas.size() - 1, all_table_files.size()); +} + +TEST_F(AtomicGroupBestEffortRecoveryTest, + HandleAtomicGroupMadeWholeAfterNewCf) { + // One AtomicGroup contains updates that become valid after a new column + // family is added. + std::vector file_infos; + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + int file_number = 10 + cfid; + file_infos.emplace_back(file_number, column_families_[cfid].name, + "" /* key */, 0 /* level */, + file_number /* epoch_number */); + } + + std::vector file_metas; + CreateDummyTableFiles(file_infos, &file_metas); + + edits_.clear(); + for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) { + if (cfid == kNumColumnFamilies - 1) { + // Corrupt the number of the last file. + file_metas[cfid].fd.packed_number_and_path_id = + PackFileNumberAndPathId(20 /* number */, 0 /* path_id */); + } + edits_.emplace_back(); + edits_.back().SetColumnFamily(cfid); + edits_.back().AddFile(0 /* level */, file_metas[cfid]); + edits_.back().SetLastSequence(++last_seqno_); + edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 - + cfid /* remaining_entries */); + } + AddNewEditsToLog(kNumColumnFamilies); + + { + // Add a new CF. + VersionEdit add_cf_edit; + add_cf_edit.AddColumnFamily("extra_cf"); + add_cf_edit.SetColumnFamily(kNumColumnFamilies); + std::string record; + ASSERT_TRUE(add_cf_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + + // Have the new CF refer to a non-existent file for an extra challenge. + VersionEdit broken_edit; + broken_edit.SetColumnFamily(kNumColumnFamilies); + file_metas[0].fd.packed_number_and_path_id = + PackFileNumberAndPathId(30 /* number */, 0 /* path_id */); + broken_edit.AddFile(0 /* level */, file_metas[0]); + broken_edit.SetLastSequence(++last_seqno_); + record.clear(); + ASSERT_TRUE(broken_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + + // This fixes up the first of the two non-existent file references. + VersionEdit fixup_edit; + fixup_edit.SetColumnFamily(kNumColumnFamilies - 1); + fixup_edit.DeleteFile(0 /* level */, 20 /* number */); + record.clear(); + ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */)); + ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record)); + assert(log_writer_.get() != nullptr); + } + + { + bool has_missing_table_file = false; + std::vector column_families = column_families_; + column_families.emplace_back("extra_cf", cf_options_); + ASSERT_OK(versions_->TryRecover(column_families, false /* read_only */, + {DescriptorFileName(1 /* number */)}, + nullptr /* db_id */, + &has_missing_table_file)); + ASSERT_TRUE(has_missing_table_file); + } + std::vector all_table_files; + std::vector all_blob_files; + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + ASSERT_EQ(file_metas.size() - 1, all_table_files.size()); +} + class VersionSetTestDropOneCF : public VersionSetTestBase, public testing::TestWithParam { public: @@ -2810,12 +3383,13 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, // last column family in an atomic group. TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { const ReadOptions read_options; + const WriteOptions write_options; std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; PrepareManifest(&column_families, &last_seqno, &log_writer); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); @@ -2838,9 +3412,9 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { cfd_to_drop->Ref(); drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); mutex_.Lock(); - s = versions_->LogAndApply(cfd_to_drop, - *cfd_to_drop->GetLatestMutableCFOptions(), - read_options, &drop_cf_edit, &mutex_, nullptr); + s = versions_->LogAndApply( + cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options, + write_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2873,7 +3447,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) { std::vector* tmp_edits = - reinterpret_cast*>(arg); + static_cast*>(arg); EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size()); for (const auto e : *tmp_edits) { bool found = false; @@ -2890,7 +3464,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); s = versions_->LogAndApply(cfds, mutable_cf_options_list, read_options, - edit_lists, &mutex_, nullptr); + write_options, edit_lists, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); @@ -2924,7 +3498,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, log_writer->reset(new log::Writer(std::move(file_writer), 0, true)); std::string record; ASSERT_TRUE(new_db.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); // Create new column family VersionEdit new_cf; @@ -2934,7 +3508,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, new_cf.SetNextFile(2); record.clear(); ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } @@ -2948,8 +3522,8 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, TEST_F(EmptyDefaultCfNewManifest, Recover) { PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = - SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -2980,7 +3554,7 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - ASSERT_OK(SetIdentityFile(env_, dbname_)); + ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_)); DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); @@ -2999,7 +3573,7 @@ class VersionSetTestEmptyDb log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; new_db.EncodeTo(&record); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } } @@ -3013,8 +3587,8 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); - Status s = - SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3054,11 +3628,12 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { { std::string record; new_cf1.EncodeTo(&record); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3101,11 +3676,12 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { new_cf.SetColumnFamily(cf_id++); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3148,7 +3724,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { new_cf.SetColumnFamily(cf_id++); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } { @@ -3159,11 +3735,12 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { tmp_edit.SetLastSequence(0); std::string record; ASSERT_TRUE(tmp_edit.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3206,7 +3783,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { new_cf.SetColumnFamily(cf_id++); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } { @@ -3216,11 +3793,12 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { tmp_edit.SetLastSequence(0); std::string record; ASSERT_TRUE(tmp_edit.EncodeTo(&record)); - s = log_writer_->AddRecord(record); + s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } log_writer_.reset(); - s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, + /* dir_contains_current_file */ nullptr); ASSERT_OK(s); std::string manifest_path; @@ -3288,9 +3866,6 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, public: VersionSetTestMissingFiles() : VersionSetTestBase("version_set_test_missing_files"), - block_based_table_options_(), - table_factory_(std::make_shared( - block_based_table_options_)), internal_comparator_( std::make_shared(options_.comparator)) {} @@ -3321,7 +3896,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, { std::string record; ASSERT_TRUE(new_db.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } const std::vector cf_names = { @@ -3339,7 +3914,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, new_cf.SetColumnFamily(cf_id); std::string record; ASSERT_TRUE(new_cf.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); VersionEdit cf_files; @@ -3347,7 +3922,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, cf_files.SetLogNumber(0); record.clear(); ASSERT_TRUE(cf_files.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); ++cf_id; } @@ -3358,71 +3933,12 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, edit.SetLastSequence(seq); std::string record; ASSERT_TRUE(edit.EncodeTo(&record)); - s = (*log_writer)->AddRecord(record); + s = (*log_writer)->AddRecord(WriteOptions(), record); ASSERT_OK(s); } *last_seqno = seq + 1; } - struct SstInfo { - uint64_t file_number; - std::string column_family; - std::string key; // the only key - int level = 0; - uint64_t epoch_number; - SstInfo(uint64_t file_num, const std::string& cf_name, - const std::string& _key, - uint64_t _epoch_number = kUnknownEpochNumber) - : SstInfo(file_num, cf_name, _key, 0, _epoch_number) {} - SstInfo(uint64_t file_num, const std::string& cf_name, - const std::string& _key, int lvl, - uint64_t _epoch_number = kUnknownEpochNumber) - : file_number(file_num), - column_family(cf_name), - key(_key), - level(lvl), - epoch_number(_epoch_number) {} - }; - - // Create dummy sst, return their metadata. Note that only file name and size - // are used. - void CreateDummyTableFiles(const std::vector& file_infos, - std::vector* file_metas) { - assert(file_metas != nullptr); - for (const auto& info : file_infos) { - uint64_t file_num = info.file_number; - std::string fname = MakeTableFileName(dbname_, file_num); - std::unique_ptr file; - Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr); - ASSERT_OK(s); - std::unique_ptr fwriter(new WritableFileWriter( - std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; - - std::unique_ptr builder(table_factory_->NewTableBuilder( - TableBuilderOptions( - immutable_options_, mutable_cf_options_, *internal_comparator_, - &int_tbl_prop_collector_factories, kNoCompression, - CompressionOptions(), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, - info.column_family, info.level), - fwriter.get())); - InternalKey ikey(info.key, 0, ValueType::kTypeValue); - builder->Add(ikey.Encode(), "value"); - ASSERT_OK(builder->Finish()); - ASSERT_OK(fwriter->Flush()); - uint64_t file_size = 0; - s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); - ASSERT_OK(s); - ASSERT_NE(0, file_size); - file_metas->emplace_back( - file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, - Temperature::kUnknown, 0, 0, 0, info.epoch_number, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0, 0, /* user_defined_timestamps_persisted */ true); - } - } - // This method updates last_sequence_. void WriteFileAdditionAndDeletionToManifest( uint32_t cf, const std::vector>& added_files, @@ -3442,12 +3958,10 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, assert(log_writer_.get() != nullptr); std::string record; ASSERT_TRUE(edit.EncodeTo(&record, 0 /* ts_sz */)); - Status s = log_writer_->AddRecord(record); + Status s = log_writer_->AddRecord(WriteOptions(), record); ASSERT_OK(s); } - BlockBasedTableOptions block_based_table_options_; - std::shared_ptr table_factory_; std::shared_ptr internal_comparator_; std::vector column_families_; SequenceNumber last_seqno_; @@ -3487,7 +4001,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3545,7 +4059,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, added_files, std::vector>()); log_writer_.reset(); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3599,7 +4113,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { WriteFileAdditionAndDeletionToManifest( /*cf=*/0, std::vector>(), deleted_files); log_writer_.reset(); - Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); std::string manifest_path; VerifyManifest(&manifest_path); @@ -3662,7 +4176,7 @@ class ChargeFileMetadataTestWithParam : public ChargeFileMetadataTest, public testing::WithParamInterface { public: - ChargeFileMetadataTestWithParam() {} + ChargeFileMetadataTestWithParam() = default; }; INSTANTIATE_TEST_CASE_P( diff --git a/db/version_util.h b/db/version_util.h index acb27749b1b..e499b9e2ed1 100644 --- a/db/version_util.h +++ b/db/version_util.h @@ -27,7 +27,8 @@ class OfflineManifestWriter { /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", options.daily_offpeak_time_utc, - /*error_handler=*/nullptr) {} + /*error_handler=*/nullptr, + /*read_only=*/false) {} Status Recover(const std::vector& column_families) { return versions_.Recover(column_families, /*read_only*/ false, @@ -35,15 +36,17 @@ class OfflineManifestWriter { /*no_error_if_files_missing*/ true); } - Status LogAndApply(const ReadOptions& read_options, ColumnFamilyData* cfd, + Status LogAndApply(const ReadOptions& read_options, + const WriteOptions& write_options, ColumnFamilyData* cfd, VersionEdit* edit, FSDirectory* dir_contains_current_file) { // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. InstrumentedMutex mutex; mutex.Lock(); - Status s = versions_.LogAndApply( - cfd, *cfd->GetLatestMutableCFOptions(), read_options, edit, &mutex, - dir_contains_current_file, false /* new_descriptor_log */); + Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, write_options, edit, &mutex, + dir_contains_current_file, + false /* new_descriptor_log */); mutex.Unlock(); return s; } diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 2b384e7d208..1f8190b93a9 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -158,11 +158,14 @@ void WalManager::PurgeObsoleteWALFiles() { ? std::min(kDefaultIntervalToDeleteObsoleteWAL, std::max(uint64_t{1}, db_options_.WAL_ttl_seconds / 2)) : kDefaultIntervalToDeleteObsoleteWAL; - if (purge_wal_files_last_run_ + time_to_check > now_seconds) { - return; - } - - purge_wal_files_last_run_ = now_seconds; + uint64_t old_last_run_time = purge_wal_files_last_run_.LoadRelaxed(); + do { + if (old_last_run_time + time_to_check > now_seconds) { + // last run is recent enough, no need to purge + return; + } + } while (!purge_wal_files_last_run_.CasWeakRelaxed( + /*expected=*/old_last_run_time, /*desired=*/now_seconds)); std::string archival_dir = ArchivalDirectory(wal_dir_); std::vector files; @@ -334,8 +337,8 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, return s; } - log_files.push_back(std::unique_ptr( - new LogFileImpl(number, log_type, sequence, size_bytes))); + log_files.emplace_back( + new LogFileImpl(number, log_type, sequence, size_bytes)); } } std::sort( diff --git a/db/wal_manager.h b/db/wal_manager.h index ab79bf00239..d8acba8afa3 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -25,6 +25,7 @@ #include "rocksdb/status.h" #include "rocksdb/transaction_log.h" #include "rocksdb/types.h" +#include "util/atomic.h" namespace ROCKSDB_NAMESPACE { @@ -118,7 +119,7 @@ class WalManager { port::Mutex read_first_record_cache_mutex_; // last time when PurgeObsoleteWALFiles ran. - uint64_t purge_wal_files_last_run_; + RelaxedAtomic purge_wal_files_last_run_; bool seq_per_batch_; diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 0ead57ae811..3be19cb3a41 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -55,7 +55,7 @@ class WalManagerTest : public testing::Test { &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr)); + /*error_handler=*/nullptr, /*read_only=*/false)); wal_manager_.reset( new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); @@ -73,8 +73,8 @@ class WalManagerTest : public testing::Test { WriteBatch batch; ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - ASSERT_OK( - current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); + ASSERT_OK(current_log_writer_->AddRecord( + WriteOptions(), WriteBatchInternal::Contents(&batch))); versions_->SetLastAllocatedSequence(seq); versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); @@ -146,7 +146,8 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { WriteBatch batch; ASSERT_OK(batch.Put("foo", "bar")); WriteBatchInternal::SetSequence(&batch, 10); - ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); + ASSERT_OK( + writer.AddRecord(WriteOptions(), WriteBatchInternal::Contents(&batch))); // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. // Waiting for lei to finish with db_test diff --git a/db/wide/db_wide_basic_test.cc b/db/wide/db_wide_basic_test.cc index 2280a3ed2e9..15d2fdff7f9 100644 --- a/db/wide/db_wide_basic_test.cc +++ b/db/wide/db_wide_basic_test.cc @@ -94,7 +94,7 @@ TEST_F(DBWideBasicTest, PutEntity) { std::array statuses; db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, - &keys[0], &values[0], &statuses[0]); + keys.data(), values.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(values[0], first_value_of_default_column); @@ -114,7 +114,7 @@ TEST_F(DBWideBasicTest, PutEntity) { std::array statuses; db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, - &keys[0], &results[0], &statuses[0]); + keys.data(), results.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(results[0].columns(), first_columns); @@ -398,8 +398,8 @@ TEST_F(DBWideBasicTest, MultiCFMultiGetEntity) { std::array results; std::array statuses; - db_->MultiGetEntity(ReadOptions(), num_keys, &column_families[0], &keys[0], - &results[0], &statuses[0]); + db_->MultiGetEntity(ReadOptions(), num_keys, column_families.data(), + keys.data(), results.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(results[0].columns(), first_columns); @@ -642,7 +642,7 @@ TEST_F(DBWideBasicTest, MergePlainKeyValue) { std::array statuses; db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, - &keys[0], &results[0], &statuses[0]); + keys.data(), results.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(results[0].columns(), expected_first_columns); @@ -822,7 +822,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { std::array statuses; db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, - &keys[0], &values[0], &statuses[0]); + keys.data(), values.data(), statuses.data()); ASSERT_EQ(values[0], first_expected_default); ASSERT_OK(statuses[0]); @@ -839,7 +839,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { std::array statuses; db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, - &keys[0], &results[0], &statuses[0]); + keys.data(), results.data(), statuses.data()); ASSERT_OK(statuses[0]); ASSERT_EQ(results[0].columns(), first_expected_columns); @@ -900,7 +900,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { int number_of_operands = 0; ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), - first_key, &merge_operands[0], + first_key, merge_operands.data(), &get_merge_opts, &number_of_operands)); ASSERT_EQ(number_of_operands, num_merge_operands); @@ -913,7 +913,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { int number_of_operands = 0; ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), - second_key, &merge_operands[0], + second_key, merge_operands.data(), &get_merge_opts, &number_of_operands)); ASSERT_EQ(number_of_operands, num_merge_operands); @@ -933,7 +933,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { int number_of_operands = 0; ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), - first_key, &merge_operands[0], + first_key, merge_operands.data(), &get_merge_opts, &number_of_operands)); ASSERT_EQ(number_of_operands, num_merge_operands); @@ -945,7 +945,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { int number_of_operands = 0; ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), - second_key, &merge_operands[0], + second_key, merge_operands.data(), &get_merge_opts, &number_of_operands)); ASSERT_EQ(number_of_operands, num_merge_operands); diff --git a/db/wide/wide_columns.cc b/db/wide/wide_columns.cc index 186be7f854c..61f56d715f6 100644 --- a/db/wide/wide_columns.cc +++ b/db/wide/wide_columns.cc @@ -12,6 +12,7 @@ namespace ROCKSDB_NAMESPACE { const Slice kDefaultWideColumnName; const WideColumns kNoWideColumns; +const AttributeGroups kNoAttributeGroups; Status PinnableWideColumns::CreateIndexForWideColumns() { Slice value_copy = value_; diff --git a/db/write_batch.cc b/db/write_batch.cc index f8583f478ed..ff17e891da6 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -90,6 +90,7 @@ enum ContentFlags : uint32_t { HAS_BLOB_INDEX = 1 << 10, HAS_BEGIN_UNPREPARE = 1 << 11, HAS_PUT_ENTITY = 1 << 12, + HAS_TIMED_PUT = 1 << 13, }; struct BatchContentClassifier : public WriteBatch::Handler { @@ -100,6 +101,11 @@ struct BatchContentClassifier : public WriteBatch::Handler { return Status::OK(); } + Status TimedPutCF(uint32_t, const Slice&, const Slice&, uint64_t) override { + content_flags |= ContentFlags::HAS_TIMED_PUT; + return Status::OK(); + } + Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */, const Slice& /* entity */) override { content_flags |= ContentFlags::HAS_PUT_ENTITY; @@ -233,9 +239,9 @@ WriteBatch& WriteBatch::operator=(WriteBatch&& src) { return *this; } -WriteBatch::~WriteBatch() {} +WriteBatch::~WriteBatch() = default; -WriteBatch::Handler::~Handler() {} +WriteBatch::Handler::~Handler() = default; void WriteBatch::Handler::LogData(const Slice& /*blob*/) { // If the user has not specified something to do with blobs, then we ignore @@ -305,6 +311,10 @@ bool WriteBatch::HasPut() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; } +bool WriteBatch::HasTimedPut() const { + return (ComputeContentFlags() & ContentFlags::HAS_TIMED_PUT) != 0; +} + bool WriteBatch::HasPutEntity() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0; } @@ -360,7 +370,8 @@ bool WriteBatch::HasRollback() const { Status ReadRecordFromWriteBatch(Slice* input, char* tag, uint32_t* column_family, Slice* key, - Slice* value, Slice* blob, Slice* xid) { + Slice* value, Slice* blob, Slice* xid, + uint64_t* write_unix_time) { assert(key != nullptr && value != nullptr); *tag = (*input)[0]; input->remove_prefix(1); @@ -468,6 +479,18 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, return Status::Corruption("bad WriteBatch PutEntity"); } break; + case kTypeColumnFamilyValuePreferredSeqno: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch TimedPut"); + } + FALLTHROUGH_INTENDED; + case kTypeValuePreferredSeqno: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value) || + !GetFixed64(input, write_unix_time)) { + return Status::Corruption("bad WriteBatch TimedPut"); + } + break; default: return Status::Corruption("unknown WriteBatch tag"); } @@ -495,6 +518,7 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); Slice key, value, blob, xid; + uint64_t write_unix_time = 0; // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as // the batch boundary symbols otherwise we would mis-count the number of @@ -519,7 +543,7 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, column_family = 0; // default s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, - &blob, &xid); + &blob, &xid, &write_unix_time); if (!s.ok()) { return s; } @@ -705,6 +729,16 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, ++found; } break; + case kTypeValuePreferredSeqno: + case kTypeColumnFamilyValuePreferredSeqno: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_TIMED_PUT)); + s = handler->TimedPutCF(column_family, key, value, write_unix_time); + if (LIKELY(s.ok())) { + empty_batch = false; + ++found; + } + break; default: return Status::Corruption("unknown WriteBatch tag"); } @@ -741,7 +775,7 @@ SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { } void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { - EncodeFixed64(&b->rep_[0], seq); + EncodeFixed64(b->rep_.data(), seq); } size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { @@ -828,6 +862,49 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, return save.commit(); } +Status WriteBatchInternal::TimedPut(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value, + uint64_t write_unix_time) { + if (key.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("value is too large"); + } + if (std::numeric_limits::max() == write_unix_time) { + return WriteBatchInternal::Put(b, column_family_id, key, value); + } + LocalSavePoint save(b); + + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeValuePreferredSeqno)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValuePreferredSeqno)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + // For a kTypeValuePreferredSeqno entry, its write time is encoded separately + // from value in an encoded WriteBatch. They are packed into one value Slice + // once it's written to the database. + PutFixed64(&b->rep_, write_unix_time); + + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_TIMED_PUT, + std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in other internal functions for why we don't need to + // differentiate between `kTypeValuePreferredSeqno` and + // `kTypeColumnFamilyValuePreferredSeqno` here. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, value, kTypeValuePreferredSeqno) + .ProtectC(column_family_id)); + } + return save.commit(); +} + Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { size_t ts_sz = 0; @@ -854,6 +931,26 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, SliceParts(&value, 1)); } +Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, uint64_t write_unix_time) { + size_t ts_sz = 0; + uint32_t cf_id = 0; + Status s; + + std::tie(s, cf_id, ts_sz) = + WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this, + column_family); + + if (!s.ok()) { + return s; + } else if (ts_sz != 0) { + return Status::NotSupported( + "TimedPut is not supported in combination with user-defined " + "timestamps."); + } + return WriteBatchInternal::TimedPut(this, cf_id, key, value, write_unix_time); +} + Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts, const Slice& value) { const Status s = CheckColumnFamilyTimestampSize(column_family, ts); @@ -1682,6 +1779,7 @@ Status WriteBatch::VerifyChecksum() const { Slice input(rep_.data() + WriteBatchInternal::kHeader, rep_.size() - WriteBatchInternal::kHeader); Slice key, value, blob, xid; + uint64_t unix_write_time = 0; char tag = 0; uint32_t column_family = 0; // default Status s; @@ -1694,7 +1792,7 @@ Status WriteBatch::VerifyChecksum() const { value.clear(); column_family = 0; s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, - &blob, &xid); + &blob, &xid, &unix_write_time); if (!s.ok()) { return s; } @@ -1744,6 +1842,10 @@ Status WriteBatch::VerifyChecksum() const { case kTypeWideColumnEntity: tag = kTypeWideColumnEntity; break; + case kTypeColumnFamilyValuePreferredSeqno: + case kTypeValuePreferredSeqno: + tag = kTypeValuePreferredSeqno; + break; default: return Status::Corruption( "unknown WriteBatch tag", @@ -1856,7 +1958,9 @@ class MemTableInserter : public WriteBatch::Handler { } void DecrementProtectionInfoIdxForTryAgain() { - if (prot_info_ != nullptr) --prot_info_idx_; + if (prot_info_ != nullptr) { + --prot_info_idx_; + } } void ResetProtectionInfo() { @@ -2062,7 +2166,7 @@ class MemTableInserter : public WriteBatch::Handler { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ropts; // it's going to be overwritten for sure, so no point caching data block // containing the old version @@ -2183,6 +2287,34 @@ class MemTableInserter : public WriteBatch::Handler { return ret_status; } + Status TimedPutCF(uint32_t column_family_id, const Slice& key, + const Slice& value, uint64_t unix_write_time) override { + const auto* kv_prot_info = NextProtectionInfo(); + Status ret_status; + std::string value_buf; + Slice packed_value = + PackValueAndWriteTime(value, unix_write_time, &value_buf); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = PutCFImpl(column_family_id, key, packed_value, + kTypeValuePreferredSeqno, &mem_kv_prot_info); + } else { + ret_status = + PutCFImpl(column_family_id, key, packed_value, + kTypeValuePreferredSeqno, nullptr /* kv_prot_info */); + } + + // TODO: this assumes that if TryAgain status is returned to the caller, + // The operation is actually tried again. The proper way to do this is to + // pass a `try_again` parameter to the operation itself and decrement + // prot_info_idx_ based on that. + if (UNLIKELY(ret_status.IsTryAgain())) { + DecrementProtectionInfoIdxForTryAgain(); + } + return ret_status; + } + Status PutEntityCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { const auto* kv_prot_info = NextProtectionInfo(); @@ -2491,8 +2623,10 @@ class MemTableInserter : public WriteBatch::Handler { LookupKey lkey(key, sequence_); // Count the number of successive merges at the head - // of the key in the memtable - size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); + // of the key in the memtable. Limit the count to the threshold for + // triggering merge to prevent unnecessary counting overhead. + size_t num_merges = mem->CountSuccessiveMergeEntries( + lkey, moptions->max_successive_merges /* limit */); if (num_merges >= moptions->max_successive_merges) { perform_merge = true; @@ -2509,8 +2643,13 @@ class MemTableInserter : public WriteBatch::Handler { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions read_options; + if (!moptions->strict_max_successive_merges) { + // Blocking the write path with read I/O is typically unacceptable, so + // only do this merge when the operands are all found in memory. + read_options.read_tier = kBlockCacheTier; + } read_options.snapshot = &read_from_snapshot; auto cf_handle = cf_mems_->GetColumnFamilyHandle(); @@ -2543,9 +2682,8 @@ class MemTableInserter : public WriteBatch::Handler { WideColumnsHelper::GetDefaultColumn(columns), {value}, moptions->info_log, moptions->statistics, SystemClock::Default().get(), - /* update_num_ops_stats */ false, &new_value, - /* result_operand */ nullptr, &new_value_type, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ false, /* op_failure_scope */ nullptr, + &new_value, /* result_operand */ nullptr, &new_value_type); } else { // `op_failure_scope` (an output parameter) is not provided (set to // nullptr) since a failure must be propagated regardless of its @@ -2554,9 +2692,8 @@ class MemTableInserter : public WriteBatch::Handler { merge_operator, key, MergeHelper::kWideBaseValue, columns, {value}, moptions->info_log, moptions->statistics, SystemClock::Default().get(), - /* update_num_ops_stats */ false, &new_value, - /* result_operand */ nullptr, &new_value_type, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ false, /* op_failure_scope */ nullptr, + &new_value, /* result_operand */ nullptr, &new_value_type); } if (!merge_status.ok()) { @@ -3018,12 +3155,17 @@ class ProtectionInfoUpdater : public WriteBatch::Handler { explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info) : prot_info_(prot_info) {} - ~ProtectionInfoUpdater() override {} + ~ProtectionInfoUpdater() override = default; Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { return UpdateProtInfo(cf, key, val, kTypeValue); } + Status TimedPutCF(uint32_t cf, const Slice& key, const Slice& val, + uint64_t /*unix_write_time*/) override { + return UpdateProtInfo(cf, key, val, kTypeValuePreferredSeqno); + } + Status PutEntityCF(uint32_t cf, const Slice& key, const Slice& entity) override { return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index ba0b6f24040..e839ccc1545 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -87,6 +87,10 @@ class WriteBatchInternal { static Status Put(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key, const SliceParts& value); + static Status TimedPut(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value, + uint64_t unix_write_time); + static Status PutEntity(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const WideColumns& columns); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 00faea4ce46..8db8c32a0a8 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -21,7 +21,6 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" -#include "table/scoped_arena_iterator.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -48,18 +47,20 @@ static std::string PrintContents(WriteBatch* b, WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr); uint32_t count = 0; int put_count = 0; + int timed_put_count = 0; int delete_count = 0; int single_delete_count = 0; int delete_range_count = 0; int merge_count = 0; for (int i = 0; i < 2; ++i) { Arena arena; - ScopedArenaIterator arena_iter_guard; + ScopedArenaPtr arena_iter_guard; std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = mem->NewIterator(ReadOptions(), &arena); - arena_iter_guard.set(iter); + iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr, + &arena); + arena_iter_guard.reset(iter); } else { iter = mem->NewRangeTombstoneIterator(ReadOptions(), kMaxSequenceNumber /* read_seq */, @@ -116,6 +117,20 @@ static std::string PrintContents(WriteBatch* b, count++; merge_count++; break; + case kTypeValuePreferredSeqno: { + state.append("TimedPut("); + state.append(ikey.user_key.ToString()); + state.append(", "); + auto [unpacked_value, unix_write_time] = + ParsePackedValueWithWriteTime(iter->value()); + state.append(unpacked_value.ToString()); + state.append(", "); + state.append(std::to_string(unix_write_time)); + state.append(")"); + count++; + timed_put_count++; + break; + } default: assert(false); break; @@ -127,6 +142,7 @@ static std::string PrintContents(WriteBatch* b, } if (s.ok()) { EXPECT_EQ(b->HasPut(), put_count > 0); + EXPECT_EQ(b->HasTimedPut(), timed_put_count > 0); EXPECT_EQ(b->HasDelete(), delete_count > 0); EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); @@ -278,6 +294,18 @@ struct TestHandler : public WriteBatch::Handler { } return Status::OK(); } + Status TimedPutCF(uint32_t column_family_id, const Slice& key, + const Slice& value, uint64_t unix_write_time) override { + if (column_family_id == 0) { + seen += "TimedPut(" + key.ToString() + ", " + value.ToString() + ", " + + std::to_string(unix_write_time) + ")"; + } else { + seen += "TimedPutCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ", " + + std::to_string(unix_write_time) + ")"; + } + return Status::OK(); + } Status PutEntityCF(uint32_t column_family_id, const Slice& key, const Slice& entity) override { std::ostringstream oss; @@ -374,6 +402,24 @@ TEST_F(WriteBatchTest, PutNotImplemented) { ASSERT_OK(batch.Iterate(&handler)); } +TEST_F(WriteBatchTest, TimedPutNotImplemented) { + WriteBatch batch; + ASSERT_OK( + batch.TimedPut(0, Slice("k1"), Slice("v1"), /*write_unix_time=*/30)); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("TimedPut(k1, v1, 30)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_TRUE(batch.Iterate(&handler).IsInvalidArgument()); + + batch.Clear(); + ASSERT_OK( + batch.TimedPut(0, Slice("k1"), Slice("v1"), + /*write_unix_time=*/std::numeric_limits::max())); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); +} + TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; ASSERT_OK(batch.Delete(Slice("k2"))); @@ -770,9 +816,8 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); - // TODO(yuzhangyu): implement this. - ASSERT_TRUE( - batch.TimedPut(&zero, Slice("foo"), Slice("bar"), 0u).IsNotSupported()); + ASSERT_OK(batch.TimedPut(&zero, Slice("foo"), Slice("bar"), + /*write_unix_time*/ 0u)); TestHandler handler; ASSERT_OK(batch.Iterate(&handler)); @@ -785,7 +830,8 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { "DeleteRangeCF(2, 3foo, 4foo)" "MergeCF(3, threethree, 3three)" "Put(foo, bar)" - "Merge(omom, nom)", + "Merge(omom, nom)" + "TimedPut(foo, bar, 0)", handler.seen); } diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index ef8e6c98d3c..97e8c5379f9 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -66,7 +66,7 @@ class MockWriteCallback : public WriteCallback { bool allow_batching_ = false; std::atomic was_called_{false}; - MockWriteCallback() {} + MockWriteCallback() = default; MockWriteCallback(const MockWriteCallback& other) { should_fail_ = other.should_fail_; @@ -113,7 +113,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; } void Put(const string& key, const string& val) { - kvs_.push_back(std::make_pair(key, val)); + kvs_.emplace_back(key, val); ASSERT_OK(write_batch_.Put(key, val)); } @@ -178,8 +178,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles, &db, seq_per_batch_, true /* batch_per_txn */); @@ -223,7 +222,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { is_last = (cur_threads_linked == write_group.size() - 1); // check my state - auto* writer = reinterpret_cast(arg); + auto* writer = static_cast(arg); if (is_leader) { ASSERT_TRUE(writer->state == @@ -253,7 +252,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { // check my state - auto* writer = reinterpret_cast(arg); + auto* writer = static_cast(arg); if (!allow_batching_) { // no batching so everyone should be a leader diff --git a/db/write_stall_stats.h b/db/write_stall_stats.h index 6394abb0a82..624e4fd68e7 100644 --- a/db/write_stall_stats.h +++ b/db/write_stall_stats.h @@ -11,11 +11,11 @@ #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -extern const std::string& InvalidWriteStallHyphenString(); +const std::string& InvalidWriteStallHyphenString(); -extern const std::string& WriteStallCauseToHyphenString(WriteStallCause cause); +const std::string& WriteStallCauseToHyphenString(WriteStallCause cause); -extern const std::string& WriteStallConditionToHyphenString( +const std::string& WriteStallConditionToHyphenString( WriteStallCondition condition); // REQUIRES: @@ -23,7 +23,7 @@ extern const std::string& WriteStallConditionToHyphenString( // // REQUIRES: // `condition` != `WriteStallCondition::kNormal` -extern InternalStats::InternalCFStatsType InternalCFStat( +InternalStats::InternalCFStatsType InternalCFStat( WriteStallCause cause, WriteStallCondition condition); // REQUIRES: @@ -31,11 +31,11 @@ extern InternalStats::InternalCFStatsType InternalCFStat( // // REQUIRES: // `condition` != `WriteStallCondition::kNormal` -extern InternalStats::InternalDBStatsType InternalDBStat( +InternalStats::InternalDBStatsType InternalDBStat( WriteStallCause cause, WriteStallCondition condition); -extern bool isCFScopeWriteStallCause(WriteStallCause cause); -extern bool isDBScopeWriteStallCause(WriteStallCause cause); +bool isCFScopeWriteStallCause(WriteStallCause cause); +bool isDBScopeWriteStallCause(WriteStallCause cause); constexpr uint32_t kNumCFScopeWriteStallCauses = static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - diff --git a/db/write_thread.cc b/db/write_thread.cc index 79870077523..39f13c31875 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -83,7 +83,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, // This is below the fast path, so that the stat is zero when all writes are // from the same thread. - PERF_TIMER_GUARD(write_thread_wait_nanos); + PERF_TIMER_FOR_WAIT_GUARD(write_thread_wait_nanos); // If we're only going to end up waiting a short period of time, // it can be a lot more efficient to call std::this_thread::yield() @@ -464,62 +464,101 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, // so we have already received our MarkJoined). CreateMissingNewerLinks(newest_writer); + // This comment illustrates how the rest of the function works using an + // example. Notation: + // + // - Items are `Writer`s + // - Items prefixed by "@" have been included in `write_group` + // - Items prefixed by "*" have compatible options with `leader`, but have not + // been included in `write_group` yet + // - Items after several spaces are in `r_list`. These have incompatible + // options with `leader` and are temporarily separated from the main list. + // + // Each line below depicts the state of the linked lists at the beginning of + // an iteration of the while-loop. + // + // @leader, n1, *n2, n3, *newest_writer + // @leader, *n2, n3, *newest_writer, n1 + // @leader, @n2, n3, *newest_writer, n1 + // + // After the while-loop, the `r_list` is grafted back onto the main list. + // + // case A: no new `Writer`s arrived + // @leader, @n2, @newest_writer, n1, n3 + // @leader, @n2, @newest_writer, n1, n3 + // + // case B: a new `Writer` (n4) arrived + // @leader, @n2, @newest_writer, n4 n1, n3 + // @leader, @n2, @newest_writer, n1, n3, n4 + // Tricky. Iteration start (leader) is exclusive and finish // (newest_writer) is inclusive. Iteration goes from old to new. Writer* w = leader; + // write_group end + Writer* we = leader; + // declare r_list + Writer* rb = nullptr; + Writer* re = nullptr; + while (w != newest_writer) { assert(w->link_newer); w = w->link_newer; - if (w->sync && !leader->sync) { - // Do not include a sync write into a batch handled by a non-sync write. - break; - } - - if (w->no_slowdown != leader->no_slowdown) { - // Do not mix writes that are ok with delays with the ones that - // request fail on delays. - break; - } - - if (w->disable_wal != leader->disable_wal) { - // Do not mix writes that enable WAL with the ones whose - // WAL disabled. - break; - } - - if (w->protection_bytes_per_key != leader->protection_bytes_per_key) { - // Do not mix writes with different levels of integrity protection. - break; - } - - if (w->rate_limiter_priority != leader->rate_limiter_priority) { - // Do not mix writes with different rate limiter priorities. - break; - } - - if (w->batch == nullptr) { - // Do not include those writes with nullptr batch. Those are not writes, - // those are something else. They want to be alone - break; - } - - if (w->callback != nullptr && !w->callback->AllowWriteBatching()) { - // don't batch writes that don't want to be batched - break; + if ((w->sync && !leader->sync) || + // Do not include a sync write into a batch handled by a non-sync write. + (w->no_slowdown != leader->no_slowdown) || + // Do not mix writes that are ok with delays with the ones that request + // fail on delays. + (w->disable_wal != leader->disable_wal) || + // Do not mix writes that enable WAL with the ones whose WAL disabled. + (w->protection_bytes_per_key != leader->protection_bytes_per_key) || + // Do not mix writes with different levels of integrity protection. + (w->rate_limiter_priority != leader->rate_limiter_priority) || + // Do not mix writes with different rate limiter priorities. + (w->batch == nullptr) || + // Do not include those writes with nullptr batch. Those are not writes + // those are something else. They want to be alone + (w->callback != nullptr && !w->callback->AllowWriteBatching()) || + // dont batch writes that don't want to be batched + (size + WriteBatchInternal::ByteSize(w->batch) > max_size) + // Do not make batch too big + ) { + // remove from list + w->link_older->link_newer = w->link_newer; + if (w->link_newer != nullptr) { + w->link_newer->link_older = w->link_older; + } + // insert into r_list + if (re == nullptr) { + rb = re = w; + w->link_older = nullptr; + } else { + w->link_older = re; + re->link_newer = w; + re = w; + } + } else { + // grow up + we = w; + w->write_group = write_group; + size += WriteBatchInternal::ByteSize(w->batch); + write_group->last_writer = w; + write_group->size++; } - - auto batch_size = WriteBatchInternal::ByteSize(w->batch); - if (size + batch_size > max_size) { - // Do not make batch too big - break; + } + // append r_list after write_group end + if (rb != nullptr) { + rb->link_older = we; + re->link_newer = nullptr; + we->link_newer = rb; + if (!newest_writer_.compare_exchange_weak(w, re)) { + while (w->link_older != newest_writer) { + w = w->link_older; + } + w->link_older = re; } - - w->write_group = write_group; - size += batch_size; - write_group->last_writer = w; - write_group->size++; } + TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w); return size; } diff --git a/db/write_thread.h b/db/write_thread.h index 6638bbfd914..7a74bd30de4 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -166,6 +166,8 @@ class WriteThread { PreReleaseCallback* _pre_release_callback = nullptr, PostMemTableCallback* _post_memtable_callback = nullptr) : batch(_batch), + // TODO: store a copy of WriteOptions instead of its seperated data + // members sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), diff --git a/db_stress_tool/batched_ops_stress.cc b/db_stress_tool/batched_ops_stress.cc index 7fb89b60bbd..25e4d23722e 100644 --- a/db_stress_tool/batched_ops_stress.cc +++ b/db_stress_tool/batched_ops_stress.cc @@ -13,8 +13,8 @@ namespace ROCKSDB_NAMESPACE { class BatchedOpsStressTest : public StressTest { public: - BatchedOpsStressTest() {} - virtual ~BatchedOpsStressTest() {} + BatchedOpsStressTest() = default; + virtual ~BatchedOpsStressTest() = default; bool IsStateTracked() const override { return false; } diff --git a/db_stress_tool/cf_consistency_stress.cc b/db_stress_tool/cf_consistency_stress.cc index a7b0895f37f..da382ae3b8d 100644 --- a/db_stress_tool/cf_consistency_stress.cc +++ b/db_stress_tool/cf_consistency_stress.cc @@ -16,7 +16,7 @@ class CfConsistencyStressTest : public StressTest { public: CfConsistencyStressTest() : batch_id_(0) {} - ~CfConsistencyStressTest() override {} + ~CfConsistencyStressTest() override = default; bool IsStateTracked() const override { return false; } @@ -232,7 +232,7 @@ class CfConsistencyStressTest : public StressTest { } db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); - for (auto s : statuses) { + for (const auto& s : statuses) { if (s.ok()) { // found case thread->stats.AddGets(1, 1); diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index c0087dc5c70..27a4299da48 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -92,7 +92,7 @@ int64_t GetOneHotKeyID(double rand_seed, int64_t max_key) { void PoolSizeChangeThread(void* v) { assert(FLAGS_compaction_thread_pool_adjust_interval > 0); - ThreadState* thread = reinterpret_cast(v); + ThreadState* thread = static_cast(v); SharedState* shared = thread->shared; while (true) { @@ -127,7 +127,7 @@ void PoolSizeChangeThread(void* v) { void DbVerificationThread(void* v) { assert(FLAGS_continuous_verification_interval > 0); - auto* thread = reinterpret_cast(v); + auto* thread = static_cast(v); SharedState* shared = thread->shared; StressTest* stress_test = shared->GetStressTest(); assert(stress_test != nullptr); @@ -154,7 +154,7 @@ void DbVerificationThread(void* v) { void CompressedCacheSetCapacityThread(void* v) { assert(FLAGS_compressed_secondary_cache_size > 0 || FLAGS_compressed_secondary_cache_ratio > 0.0); - auto* thread = reinterpret_cast(v); + auto* thread = static_cast(v); SharedState* shared = thread->shared; while (true) { { @@ -200,7 +200,7 @@ void CompressedCacheSetCapacityThread(void* v) { // Lower by upto 50% of usable block cache capacity adjustment = (adjustment * thread->rand.Uniform(50)) / 100; block_cache->SetCapacity(capacity - adjustment); - fprintf(stderr, "New cache capacity = %lu\n", + fprintf(stdout, "New cache capacity = %lu\n", block_cache->GetCapacity()); db_stress_env->SleepForMicroseconds(10 * 1000 * 1000); block_cache->SetCapacity(capacity); @@ -210,10 +210,7 @@ void CompressedCacheSetCapacityThread(void* v) { (double)thread->rand.Uniform( FLAGS_compressed_secondary_cache_ratio * 100) / 100; - if (new_comp_cache_ratio == 0.0) { - new_comp_cache_ratio = 0.05; - } - fprintf(stderr, "New comp cache ratio = %f\n", new_comp_cache_ratio); + fprintf(stdout, "New comp cache ratio = %f\n", new_comp_cache_ratio); s = UpdateTieredCache(block_cache, /*capacity*/ -1, new_comp_cache_ratio); @@ -403,7 +400,7 @@ class MyXXH64Checksum : public FileChecksumGenerator { XXH64_reset(state_, 0); } - virtual ~MyXXH64Checksum() override { XXH64_freeState(state_); } + ~MyXXH64Checksum() override { XXH64_freeState(state_); } void Update(const char* data, size_t n) override { XXH64_update(state_, data, n); diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 485400e05b6..4a426f44ca8 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -313,6 +313,7 @@ DECLARE_uint32(memtable_protection_bytes_per_key); DECLARE_uint32(block_protection_bytes_per_key); DECLARE_uint64(user_timestamp_size); +DECLARE_bool(persist_user_defined_timestamps); DECLARE_string(secondary_cache_uri); DECLARE_int32(secondary_cache_fault_one_in); @@ -350,8 +351,47 @@ DECLARE_uint64(readahead_size); DECLARE_uint64(initial_auto_readahead_size); DECLARE_uint64(max_auto_readahead_size); DECLARE_uint64(num_file_reads_for_auto_readahead); -DECLARE_bool(use_io_uring); DECLARE_bool(auto_readahead_size); +DECLARE_bool(allow_fallocate); +DECLARE_int32(table_cache_numshardbits); +DECLARE_bool(enable_write_thread_adaptive_yield); +DECLARE_uint64(log_readahead_size); +DECLARE_uint64(bgerror_resume_retry_interval); +DECLARE_uint64(delete_obsolete_files_period_micros); +DECLARE_uint64(max_log_file_size); +DECLARE_uint64(log_file_time_to_roll); +DECLARE_bool(use_adaptive_mutex); +DECLARE_bool(advise_random_on_open); +DECLARE_uint64(WAL_ttl_seconds); +DECLARE_uint64(WAL_size_limit_MB); +DECLARE_bool(strict_bytes_per_sync); +DECLARE_bool(avoid_flush_during_shutdown); +DECLARE_bool(fill_cache); +DECLARE_bool(optimize_multiget_for_io); +DECLARE_bool(memtable_insert_hint_per_batch); +DECLARE_bool(dump_malloc_stats); +DECLARE_uint64(stats_history_buffer_size); +DECLARE_bool(skip_stats_update_on_db_open); +DECLARE_bool(optimize_filters_for_hits); +DECLARE_uint64(sample_for_compression); +DECLARE_bool(report_bg_io_stats); +DECLARE_bool(cache_index_and_filter_blocks_with_high_priority); +DECLARE_bool(use_delta_encoding); +DECLARE_bool(verify_compression); +DECLARE_uint32(read_amp_bytes_per_bit); +DECLARE_bool(enable_index_compression); +DECLARE_uint32(index_shortening); +DECLARE_uint32(metadata_charge_policy); +DECLARE_bool(use_adaptive_mutex_lru); +DECLARE_uint32(compress_format_version); +DECLARE_uint64(manifest_preallocation_size); +DECLARE_bool(enable_checksum_handoff); +DECLARE_uint64(max_total_wal_size); +DECLARE_double(high_pri_pool_ratio); +DECLARE_double(low_pri_pool_ratio); +DECLARE_uint64(soft_pending_compaction_bytes_limit); +DECLARE_uint64(hard_pending_compaction_bytes_limit); +DECLARE_uint64(max_sequential_skip_in_iterations); constexpr long KB = 1024; constexpr int kRandomValueMaxFactor = 3; @@ -488,7 +528,7 @@ inline bool GetNextPrefix(const ROCKSDB_NAMESPACE::Slice& src, std::string* v) { #endif // Append `val` to `*key` in fixed-width big-endian format -extern inline void AppendIntToString(uint64_t val, std::string* key) { +inline void AppendIntToString(uint64_t val, std::string* key) { // PutFixed64 uses little endian PutFixed64(key, val); // Reverse to get big endian @@ -517,7 +557,7 @@ extern KeyGenContext key_gen_ctx; // - {0}...{x-1} // {(x-1),0}..{(x-1),(y-1)},{(x-1),(y-1),0}..{(x-1),(y-1),(z-1)} and so on. // Additionally, a trailer of 0-7 bytes could be appended. -extern inline std::string Key(int64_t val) { +inline std::string Key(int64_t val) { uint64_t window = key_gen_ctx.window; size_t levels = key_gen_ctx.weights.size(); std::string key; @@ -555,7 +595,7 @@ extern inline std::string Key(int64_t val) { } // Given a string key, map it to an index into the expected values buffer -extern inline bool GetIntVal(std::string big_endian_key, uint64_t* key_p) { +inline bool GetIntVal(std::string big_endian_key, uint64_t* key_p) { size_t size_key = big_endian_key.size(); std::vector prefixes; @@ -610,8 +650,8 @@ inline bool GetFirstIntValInPrefix(std::string big_endian_prefix, return GetIntVal(std::move(big_endian_prefix), key_p); } -extern inline uint64_t GetPrefixKeyCount(const std::string& prefix, - const std::string& ub) { +inline uint64_t GetPrefixKeyCount(const std::string& prefix, + const std::string& ub) { uint64_t start = 0; uint64_t end = 0; @@ -623,7 +663,7 @@ extern inline uint64_t GetPrefixKeyCount(const std::string& prefix, return end - start; } -extern inline std::string StringToHex(const std::string& str) { +inline std::string StringToHex(const std::string& str) { std::string result = "0x"; result.append(Slice(str).ToString(true)); return result; @@ -642,49 +682,49 @@ inline std::string WideColumnsToHex(const WideColumns& columns) { } // Unified output format for double parameters -extern inline std::string FormatDoubleParam(double param) { +inline std::string FormatDoubleParam(double param) { return std::to_string(param); } // Make sure that double parameter is a value we can reproduce by // re-inputting the value printed. -extern inline void SanitizeDoubleParam(double* param) { +inline void SanitizeDoubleParam(double* param) { *param = std::atof(FormatDoubleParam(*param).c_str()); } -extern void PoolSizeChangeThread(void* v); +void PoolSizeChangeThread(void* v); -extern void DbVerificationThread(void* v); +void DbVerificationThread(void* v); -extern void CompressedCacheSetCapacityThread(void* v); +void CompressedCacheSetCapacityThread(void* v); -extern void TimestampedSnapshotsThread(void* v); +void TimestampedSnapshotsThread(void* v); -extern void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz); +void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz); -extern int64_t GenerateOneKey(ThreadState* thread, uint64_t iteration); +int64_t GenerateOneKey(ThreadState* thread, uint64_t iteration); -extern std::vector GenerateNKeys(ThreadState* thread, int num_keys, - uint64_t iteration); +std::vector GenerateNKeys(ThreadState* thread, int num_keys, + uint64_t iteration); -extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz); -extern uint32_t GetValueBase(Slice s); +size_t GenerateValue(uint32_t rand, char* v, size_t max_sz); +uint32_t GetValueBase(Slice s); -extern WideColumns GenerateWideColumns(uint32_t value_base, const Slice& slice); -extern WideColumns GenerateExpectedWideColumns(uint32_t value_base, - const Slice& slice); -extern bool VerifyWideColumns(const Slice& value, const WideColumns& columns); -extern bool VerifyWideColumns(const WideColumns& columns); +WideColumns GenerateWideColumns(uint32_t value_base, const Slice& slice); +WideColumns GenerateExpectedWideColumns(uint32_t value_base, + const Slice& slice); +bool VerifyWideColumns(const Slice& value, const WideColumns& columns); +bool VerifyWideColumns(const WideColumns& columns); -extern StressTest* CreateCfConsistencyStressTest(); -extern StressTest* CreateBatchedOpsStressTest(); -extern StressTest* CreateNonBatchedOpsStressTest(); -extern StressTest* CreateMultiOpsTxnsStressTest(); -extern void CheckAndSetOptionsForMultiOpsTxnStressTest(); -extern void InitializeHotKeyGenerator(double alpha); -extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key); +StressTest* CreateCfConsistencyStressTest(); +StressTest* CreateBatchedOpsStressTest(); +StressTest* CreateNonBatchedOpsStressTest(); +StressTest* CreateMultiOpsTxnsStressTest(); +void CheckAndSetOptionsForMultiOpsTxnStressTest(); +void InitializeHotKeyGenerator(double alpha); +int64_t GetOneHotKeyID(double rand_seed, int64_t max_key); -extern std::string GetNowNanos(); +std::string GetNowNanos(); std::shared_ptr GetFileChecksumImpl( const std::string& name); diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 92730beca2b..b47fa89e6ae 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { void ThreadBody(void* v) { ThreadStatusUtil::RegisterThread(db_stress_env, ThreadStatus::USER); - ThreadState* thread = reinterpret_cast(v); + ThreadState* thread = static_cast(v); SharedState* shared = thread->shared; if (!FLAGS_skip_verifydb && shared->ShouldVerifyAtBeginning()) { diff --git a/db_stress_tool/db_stress_driver.h b/db_stress_tool/db_stress_driver.h index a173470ff7d..755d56ff02b 100644 --- a/db_stress_tool/db_stress_driver.h +++ b/db_stress_tool/db_stress_driver.h @@ -12,7 +12,7 @@ #pragma once #include "db_stress_tool/db_stress_test_base.h" namespace ROCKSDB_NAMESPACE { -extern void ThreadBody(void* /*thread_state*/); -extern bool RunStressTest(SharedState*); +void ThreadBody(void* /*thread_state*/); +bool RunStressTest(SharedState*); } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index 83e6838c703..ab4a074fc7a 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -60,7 +60,7 @@ class DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper { } IOStatus ReadAsync(FSReadRequest& req, const IOOptions& options, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override { #ifndef NDEBUG @@ -76,6 +76,155 @@ class DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper { } }; +class DbStressWritableFileWrapper : public FSWritableFileOwnerWrapper { + public: + explicit DbStressWritableFileWrapper(std::unique_ptr&& target) + : FSWritableFileOwnerWrapper(std::move(target)) {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Append(data, options, dbg); + } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Append(data, options, verification_info, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->PositionedAppend(data, offset, options, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->PositionedAppend(data, offset, options, verification_info, + dbg); + } + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Truncate(size, options, dbg); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Close(options, dbg); + } + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Flush(options, dbg); + } + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Sync(options, dbg); + } + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Fsync(options, dbg); + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Allocate(offset, len, options, dbg); + } +#endif + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->RangeSync(offset, nbytes, options, dbg); + } +}; + class DbStressFSWrapper : public FileSystemWrapper { public: explicit DbStressFSWrapper(const std::shared_ptr& t) @@ -95,6 +244,17 @@ class DbStressFSWrapper : public FileSystemWrapper { return s; } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewWritableFile(f, file_opts, &file, dbg); + if (s.ok()) { + r->reset(new DbStressWritableFileWrapper(std::move(file))); + } + return s; + } + IOStatus DeleteFile(const std::string& f, const IOOptions& opts, IODebugContext* dbg) override { // We determine whether it is a manifest file by searching a strong, diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index cd1c978b810..bada4396150 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -7,6 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "rocksdb/cache.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/backup_engine.h" #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" @@ -33,6 +36,11 @@ DEFINE_int64(max_key, 1 * KB * KB, DEFINE_int32(max_key_len, 3, "Maximum length of a key in 8-byte units"); +DEFINE_uint64(max_sequential_skip_in_iterations, + ROCKSDB_NAMESPACE::Options().max_sequential_skip_in_iterations, + "Iterator will reseek after scanning this number of keys with" + "the same user key during Next/Prev()."); + DEFINE_string(key_len_percent_dist, "", "Percentages of keys of various lengths. For example, 1,30,69 " "means 1% of keys are 8 bytes, 30% are 16 bytes, and 69% are " @@ -402,7 +410,8 @@ DEFINE_double(experimental_mempurge_threshold, 0.0, "Maximum estimated useful payload that triggers a " "mempurge process to collect memtable garbage bytes."); -DEFINE_bool(enable_write_thread_adaptive_yield, true, +DEFINE_bool(enable_write_thread_adaptive_yield, + ROCKSDB_NAMESPACE::Options().enable_write_thread_adaptive_yield, "Use a yielding spin loop for brief writer thread waits."); // Options for StackableDB-based BlobDB @@ -1025,6 +1034,10 @@ DEFINE_uint64(user_timestamp_size, 0, "Number of bytes for a user-defined timestamp. Currently, only " "8-byte is supported"); +DEFINE_bool(persist_user_defined_timestamps, true, + "Flag to indicate whether user-defined timestamps will be persisted" + " during Flush"); + DEFINE_int32(open_metadata_write_fault_one_in, 0, "On non-zero, enables fault injection on file metadata write " "during DB reopen."); @@ -1125,11 +1138,9 @@ DEFINE_uint64(stats_dump_period_sec, ROCKSDB_NAMESPACE::Options().stats_dump_period_sec, "Gap between printing stats to log in seconds"); -DEFINE_bool(use_io_uring, false, "Enable the use of IO uring on Posix"); - DEFINE_bool(verification_only, false, "If true, tests will only execute verification step"); -extern "C" bool RocksDbIOUringEnable() { return FLAGS_use_io_uring; } +extern "C" bool RocksDbIOUringEnable() { return true; } DEFINE_uint32(memtable_max_range_deletions, 0, "If nonzero, RocksDB will try to flush the current memtable" @@ -1142,4 +1153,155 @@ DEFINE_uint32(bottommost_file_compaction_delay, 0, DEFINE_bool(auto_readahead_size, false, "Does auto tuning of readahead_size when enabled during scans."); +DEFINE_bool(allow_fallocate, ROCKSDB_NAMESPACE::Options().allow_fallocate, + "Options.allow_fallocate"); + +DEFINE_int32(table_cache_numshardbits, + ROCKSDB_NAMESPACE::Options().table_cache_numshardbits, + "Options.table_cache_numshardbits"); + +DEFINE_uint64(log_readahead_size, + ROCKSDB_NAMESPACE::Options().log_readahead_size, + "Options.log_readahead_size"); + +DEFINE_uint64(bgerror_resume_retry_interval, + ROCKSDB_NAMESPACE::Options().bgerror_resume_retry_interval, + "Options.bgerror_resume_retry_interval"); + +DEFINE_uint64(delete_obsolete_files_period_micros, + ROCKSDB_NAMESPACE::Options().delete_obsolete_files_period_micros, + "Options.delete_obsolete_files_period_micros"); + +DEFINE_uint64(max_log_file_size, ROCKSDB_NAMESPACE::Options().max_log_file_size, + "Options.max_log_file_sizes"); + +DEFINE_uint64(log_file_time_to_roll, + ROCKSDB_NAMESPACE::Options().log_file_time_to_roll, + "Options.log_file_time_to_roll"); + +DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex, + "Options.use_adaptive_mutex"); + +DEFINE_bool(advise_random_on_open, + ROCKSDB_NAMESPACE::Options().advise_random_on_open, + "Options.advise_random_on_open"); + +DEFINE_uint64(WAL_ttl_seconds, ROCKSDB_NAMESPACE::Options().WAL_ttl_seconds, + "Options.WAL_ttl_seconds"); + +DEFINE_uint64(WAL_size_limit_MB, ROCKSDB_NAMESPACE::Options().WAL_size_limit_MB, + "Options.WAL_size_limit_MB"); + +DEFINE_bool(strict_bytes_per_sync, + ROCKSDB_NAMESPACE::Options().strict_bytes_per_sync, + "Options.strict_bytes_per_sync"); + +DEFINE_bool(avoid_flush_during_shutdown, + ROCKSDB_NAMESPACE::Options().avoid_flush_during_shutdown, + "Options.avoid_flush_during_shutdown"); + +DEFINE_bool(fill_cache, ROCKSDB_NAMESPACE::ReadOptions().fill_cache, + "ReadOptions.fill_cache"); + +DEFINE_bool(optimize_multiget_for_io, + ROCKSDB_NAMESPACE::ReadOptions().optimize_multiget_for_io, + "ReadOptions.optimize_multiget_for_io"); + +DEFINE_bool(memtable_insert_hint_per_batch, + ROCKSDB_NAMESPACE::WriteOptions().memtable_insert_hint_per_batch, + "WriteOptions.memtable_insert_hint_per_batch"); + +DEFINE_bool(dump_malloc_stats, ROCKSDB_NAMESPACE::Options().dump_malloc_stats, + "Options.dump_malloc_stats"); + +DEFINE_uint64(stats_history_buffer_size, + ROCKSDB_NAMESPACE::Options().stats_history_buffer_size, + "Options.stats_history_buffer_size"); + +DEFINE_bool(skip_stats_update_on_db_open, + ROCKSDB_NAMESPACE::Options().skip_stats_update_on_db_open, + "Options.skip_stats_update_on_db_open"); + +DEFINE_bool(optimize_filters_for_hits, + ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits, + "Options.optimize_filters_for_hits"); + +DEFINE_uint64(sample_for_compression, + ROCKSDB_NAMESPACE::Options().sample_for_compression, + "Options.sample_for_compression"); + +DEFINE_bool(report_bg_io_stats, ROCKSDB_NAMESPACE::Options().report_bg_io_stats, + "Options.report_bg_io_stats"); + +DEFINE_bool( + cache_index_and_filter_blocks_with_high_priority, + ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .cache_index_and_filter_blocks_with_high_priority, + "BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority"); + +DEFINE_bool(use_delta_encoding, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().use_delta_encoding, + "BlockBasedTableOptions.use_delta_encoding"); + +DEFINE_bool(verify_compression, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().verify_compression, + "BlockBasedTableOptions.verify_compression"); + +DEFINE_uint32( + read_amp_bytes_per_bit, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit, + "Options.read_amp_bytes_per_bit"); + +DEFINE_bool( + enable_index_compression, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression, + "BlockBasedTableOptions.enable_index_compression"); + +DEFINE_uint32(index_shortening, + static_cast( + ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_shortening), + "BlockBasedTableOptions.index_shortening"); + +DEFINE_uint32(metadata_charge_policy, + static_cast(ROCKSDB_NAMESPACE::ShardedCacheOptions() + .metadata_charge_policy), + "ShardedCacheOptions.metadata_charge_policy"); + +DEFINE_bool(use_adaptive_mutex_lru, + ROCKSDB_NAMESPACE::LRUCacheOptions().use_adaptive_mutex, + "LRUCacheOptions.use_adaptive_mutex"); + +DEFINE_uint32( + compress_format_version, + static_cast(ROCKSDB_NAMESPACE::CompressedSecondaryCacheOptions() + .compress_format_version), + "CompressedSecondaryCacheOptions.compress_format_version"); + +DEFINE_uint64(manifest_preallocation_size, + ROCKSDB_NAMESPACE::Options().manifest_preallocation_size, + "Options.manifest_preallocation_size"); + +DEFINE_uint64(max_total_wal_size, + ROCKSDB_NAMESPACE::Options().max_total_wal_size, + "Options.max_total_wal_size"); + +DEFINE_bool(enable_checksum_handoff, false, + "If true, include all the supported files in " + "Options.checksum_handoff_file. Otherwise include no files."); + +DEFINE_double(high_pri_pool_ratio, + ROCKSDB_NAMESPACE::LRUCacheOptions().high_pri_pool_ratio, + "LRUCacheOptions.high_pri_pool_ratio"); + +DEFINE_double(low_pri_pool_ratio, + ROCKSDB_NAMESPACE::LRUCacheOptions().low_pri_pool_ratio, + "LRUCacheOptions.low_pri_pool_ratio"); + +DEFINE_uint64(soft_pending_compaction_bytes_limit, + ROCKSDB_NAMESPACE::Options().soft_pending_compaction_bytes_limit, + "Options.soft_pending_compaction_bytes_limit"); + +DEFINE_uint64(hard_pending_compaction_bytes_limit, + ROCKSDB_NAMESPACE::Options().hard_pending_compaction_bytes_limit, + "Options.hard_pending_compaction_bytes_limit"); #endif // GFLAGS diff --git a/db_stress_tool/db_stress_listener.cc b/db_stress_tool/db_stress_listener.cc index e2838c582a1..e1405c7c68b 100644 --- a/db_stress_tool/db_stress_listener.cc +++ b/db_stress_tool/db_stress_listener.cc @@ -67,7 +67,7 @@ UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name, Env* env) std::string id(24U, '\0'); Slice result; for (;;) { - s = reader->Read(id.size(), opts, &result, &id[0], /*dbg*/ nullptr); + s = reader->Read(id.size(), opts, &result, id.data(), /*dbg*/ nullptr); if (!s.ok()) { fprintf(stderr, "Error reading unique id file: %s\n", s.ToString().c_str()); @@ -116,9 +116,9 @@ UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name, Env* env) new WritableFileWriter(std::move(file_writer), path_, FileOptions())); if (size > 0) { - st = CopyFile(fs.get(), tmp_path, data_file_writer_, size, - /*use_fsync*/ true, /*io_tracer*/ nullptr, - /*temparature*/ Temperature::kHot); + st = CopyFile(fs.get(), tmp_path, Temperature::kUnknown, data_file_writer_, + size, + /*use_fsync*/ true, /*io_tracer*/ nullptr); if (!st.ok()) { fprintf(stderr, "Error copying contents of old unique id file: %s\n", st.ToString().c_str()); @@ -130,8 +130,13 @@ UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name, Env* env) } UniqueIdVerifier::~UniqueIdVerifier() { - IOStatus s = data_file_writer_->Close(); + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + IOStatus s; + s = data_file_writer_->Close(IOOptions()); assert(s.ok()); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } void UniqueIdVerifier::VerifyNoWrite(const std::string& id) { @@ -153,13 +158,14 @@ void UniqueIdVerifier::Verify(const std::string& id) { if (id_set_.size() >= 4294967) { return; } - IOStatus s = data_file_writer_->Append(Slice(id)); + IOOptions opts; + IOStatus s = data_file_writer_->Append(opts, Slice(id)); if (!s.ok()) { fprintf(stderr, "Error writing to unique id file: %s\n", s.ToString().c_str()); assert(false); } - s = data_file_writer_->Flush(); + s = data_file_writer_->Flush(opts); if (!s.ok()) { fprintf(stderr, "Error flushing unique id file: %s\n", s.ToString().c_str()); diff --git a/db_stress_tool/db_stress_table_properties_collector.h b/db_stress_tool/db_stress_table_properties_collector.h index d1758cbb4cd..4723f6fc5d2 100644 --- a/db_stress_tool/db_stress_table_properties_collector.h +++ b/db_stress_tool/db_stress_table_properties_collector.h @@ -23,25 +23,25 @@ class DbStressTablePropertiesCollector : public TablePropertiesCollector { : need_compact_(Random::GetTLSInstance()->OneInOpt( FLAGS_mark_for_compaction_one_file_in)) {} - virtual Status AddUserKey(const Slice& /* key */, const Slice& /* value */, - EntryType /*type*/, SequenceNumber /*seq*/, - uint64_t /*file_size*/) override { + Status AddUserKey(const Slice& /* key */, const Slice& /* value */, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { return Status::OK(); } - virtual Status Finish(UserCollectedProperties* /* properties */) override { + Status Finish(UserCollectedProperties* /* properties */) override { return Status::OK(); } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } - virtual const char* Name() const override { + const char* Name() const override { return "DbStressTablePropertiesCollector"; } - virtual bool NeedCompact() const override { return need_compact_; } + bool NeedCompact() const override { return need_compact_; } private: const bool need_compact_; @@ -52,12 +52,12 @@ class DbStressTablePropertiesCollector : public TablePropertiesCollector { class DbStressTablePropertiesCollectorFactory : public TablePropertiesCollectorFactory { public: - virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context /* context */) override { return new DbStressTablePropertiesCollector(); } - virtual const char* Name() const override { + const char* Name() const override { return "DbStressTablePropertiesCollectorFactory"; } }; diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 20077558f6d..4510634fffd 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -141,6 +141,7 @@ std::shared_ptr StressTest::NewCache(size_t capacity, } CompressedSecondaryCacheOptions opts; opts.capacity = FLAGS_compressed_secondary_cache_size; + opts.compress_format_version = FLAGS_compress_format_version; secondary_cache = NewCompressedSecondaryCache(opts); if (secondary_cache == nullptr) { fprintf(stderr, "Failed to allocate compressed secondary cache\n"); @@ -191,6 +192,11 @@ std::shared_ptr StressTest::NewCache(size_t capacity, LRUCacheOptions opts; opts.capacity = capacity; opts.num_shard_bits = num_shard_bits; + opts.metadata_charge_policy = + static_cast(FLAGS_metadata_charge_policy); + opts.use_adaptive_mutex = FLAGS_use_adaptive_mutex_lru; + opts.high_pri_pool_ratio = FLAGS_high_pri_pool_ratio; + opts.low_pri_pool_ratio = FLAGS_low_pri_pool_ratio; if (tiered) { TieredCacheOptions tiered_opts; tiered_opts.cache_opts = &opts; @@ -247,6 +253,7 @@ bool StressTest::BuildOptionsTable() { }}, {"memtable_huge_page_size", {"0", std::to_string(2 * 1024 * 1024)}}, {"max_successive_merges", {"0", "2", "4"}}, + {"strict_max_successive_merges", {"false", "true"}}, {"inplace_update_num_locks", {"100", "200", "300"}}, // TODO: re-enable once internal task T124324915 is fixed. // {"experimental_mempurge_threshold", {"0.0", "1.0"}}, @@ -380,7 +387,7 @@ void StressTest::FinishInitDb(SharedState* shared) { if (FLAGS_enable_compaction_filter) { auto* compaction_filter_factory = - reinterpret_cast( + static_cast( options_.compaction_filter_factory.get()); assert(compaction_filter_factory); // This must be called only after any potential `SharedState::Restore()` has @@ -430,6 +437,13 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, PinnableSlice v; s = db->Get(ropt, cf, snap_state.key, &v); if (!s.ok() && !s.IsNotFound()) { + // When `persist_user_defined_timestamps` is false, a repeated read with + // both a read timestamp and an explicitly taken snapshot cannot guarantee + // consistent result all the time. When it cannot return consistent result, + // it will return an `InvalidArgument` status. + if (s.IsInvalidArgument() && !FLAGS_persist_user_defined_timestamps) { + return Status::OK(); + } return s; } if (snap_state.status != s) { @@ -606,10 +620,11 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, } } - pending_expected_value.Commit(); if (!s.ok()) { + pending_expected_value.Rollback(); break; } + pending_expected_value.Commit(); } if (!s.ok()) { break; @@ -821,10 +836,14 @@ void StressTest::OperateDb(ThreadState* thread) { read_opts.adaptive_readahead = FLAGS_adaptive_readahead; read_opts.readahead_size = FLAGS_readahead_size; read_opts.auto_readahead_size = FLAGS_auto_readahead_size; + read_opts.fill_cache = FLAGS_fill_cache; + read_opts.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io; WriteOptions write_opts; if (FLAGS_rate_limit_auto_wal_flush) { write_opts.rate_limiter_priority = Env::IO_USER; } + write_opts.memtable_insert_hint_per_batch = + FLAGS_memtable_insert_hint_per_batch; auto shared = thread->shared; char value[100]; std::string from_db; @@ -1255,6 +1274,10 @@ Status StressTest::TestIterate(ThreadState* thread, Slice read_ts_slice; MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice, ro); + std::string op_logs; + ro.pin_data = thread->rand.OneIn(2); + ro.background_purge_on_iterator_cleanup = thread->rand.OneIn(2); + bool expect_total_order = false; if (thread->rand.OneIn(16)) { // When prefix extractor is used, it's useful to cover total order seek. @@ -1306,7 +1329,6 @@ Status StressTest::TestIterate(ThreadState* thread, } } - std::string op_logs; constexpr size_t kOpLogsLimit = 10000; for (const std::string& key_str : key_strs) { @@ -1334,18 +1356,6 @@ Status StressTest::TestIterate(ThreadState* thread, lower_bound = Slice(lower_bound_str); } - // Record some options to op_logs - op_logs += "total_order_seek: "; - op_logs += (ro.total_order_seek ? "1 " : "0 "); - op_logs += "auto_prefix_mode: "; - op_logs += (ro.auto_prefix_mode ? "1 " : "0 "); - if (ro.iterate_upper_bound != nullptr) { - op_logs += "ub: " + upper_bound.ToString(true) + " "; - } - if (ro.iterate_lower_bound != nullptr) { - op_logs += "lb: " + lower_bound.ToString(true) + " "; - } - // Set up an iterator, perform the same operations without bounds and with // total order seek, and compare the results. This is to identify bugs // related to bounds, prefix extractor, or reseeking. Sometimes we are @@ -1522,7 +1532,20 @@ void StressTest::VerifyIterator(ThreadState* thread, ? nullptr : options_.prefix_extractor.get(); const Comparator* cmp = options_.comparator; - + std::ostringstream read_opt_oss; + read_opt_oss << "pin_data: " << ro.pin_data + << ", background_purge_on_iterator_cleanup: " + << ro.background_purge_on_iterator_cleanup + << ", total_order_seek: " << ro.total_order_seek + << ", auto_prefix_mode: " << ro.auto_prefix_mode + << ", iterate_upper_bound: " + << (ro.iterate_upper_bound + ? ro.iterate_upper_bound->ToString(true).c_str() + : "") + << ", iterate_lower_bound: " + << (ro.iterate_lower_bound + ? ro.iterate_lower_bound->ToString(true).c_str() + : ""); if (iter->Valid() && !cmp_iter->Valid()) { if (pe != nullptr) { if (!pe->InDomain(seek_key)) { @@ -1541,8 +1564,10 @@ void StressTest::VerifyIterator(ThreadState* thread, } fprintf(stderr, "Control iterator is invalid but iterator has key %s " - "%s\n", - iter->key().ToString(true).c_str(), op_logs.c_str()); + "%s under specified iterator ReadOptions: %s (Empty string or " + "missing field indicates default option or value is used)\n", + iter->key().ToString(true).c_str(), op_logs.c_str(), + read_opt_oss.str().c_str()); *diverged = true; } else if (cmp_iter->Valid()) { @@ -1570,9 +1595,12 @@ void StressTest::VerifyIterator(ThreadState* thread, } fprintf(stderr, "Iterator stays in prefix but control doesn't" - " iterator key %s control iterator key %s %s\n", + " iterator key %s control iterator key %s %s under specified " + "iterator ReadOptions: %s (Empty string or " + "missing field indicates default option or value is used)\n", iter->key().ToString(true).c_str(), - cmp_iter->key().ToString(true).c_str(), op_logs.c_str()); + cmp_iter->key().ToString(true).c_str(), op_logs.c_str(), + read_opt_oss.str().c_str()); } } // Check upper or lower bounds. @@ -1589,8 +1617,11 @@ void StressTest::VerifyIterator(ThreadState* thread, /*b_has_ts=*/false) > 0))) { fprintf(stderr, "Iterator diverged from control iterator which" - " has value %s %s\n", - total_order_key.ToString(true).c_str(), op_logs.c_str()); + " has value %s %s under specified iterator ReadOptions: %s " + "(Empty string or " + "missing field indicates default option or value is used)\n", + total_order_key.ToString(true).c_str(), op_logs.c_str(), + read_opt_oss.str().c_str()); if (iter->Valid()) { fprintf(stderr, "iterator has value %s\n", iter->key().ToString(true).c_str()); @@ -1672,6 +1703,37 @@ Status StressTest::TestBackupRestore( } else { backup_opts.schema_version = 2; } + if (thread->rand.OneIn(3)) { + backup_opts.max_background_operations = 16; + } else { + backup_opts.max_background_operations = 1; + } + if (thread->rand.OneIn(2)) { + backup_opts.backup_rate_limiter.reset(NewGenericRateLimiter( + FLAGS_backup_max_size * 1000000 /* rate_bytes_per_sec */, + 1 /* refill_period_us */)); + } + if (thread->rand.OneIn(2)) { + backup_opts.restore_rate_limiter.reset(NewGenericRateLimiter( + FLAGS_backup_max_size * 1000000 /* rate_bytes_per_sec */, + 1 /* refill_period_us */)); + } + std::ostringstream backup_opt_oss; + backup_opt_oss << "share_table_files: " << backup_opts.share_table_files + << ", share_files_with_checksum: " + << backup_opts.share_files_with_checksum + << ", share_files_with_checksum_naming: " + << backup_opts.share_files_with_checksum_naming + << ", schema_version: " << backup_opts.schema_version + << ", max_background_operations: " + << backup_opts.max_background_operations + << ", backup_rate_limiter: " + << backup_opts.backup_rate_limiter.get() + << ", restore_rate_limiter: " + << backup_opts.restore_rate_limiter.get(); + + std::ostringstream create_backup_opt_oss; + std::ostringstream restore_opts_oss; BackupEngine* backup_engine = nullptr; std::string from = "a backup/restore operation"; Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine); @@ -1698,6 +1760,16 @@ Status StressTest::TestBackupRestore( // lock and wait on a background operation (flush). create_opts.flush_before_backup = true; } + create_opts.decrease_background_thread_cpu_priority = thread->rand.OneIn(2); + create_opts.background_thread_cpu_priority = static_cast( + thread->rand.Next() % (static_cast(CpuPriority::kHigh) + 1)); + create_backup_opt_oss << "flush_before_backup: " + << create_opts.flush_before_backup + << ", decrease_background_thread_cpu_priority: " + << create_opts.decrease_background_thread_cpu_priority + << ", background_thread_cpu_priority: " + << static_cast( + create_opts.background_thread_cpu_priority); s = backup_engine->CreateNewBackup(create_opts, db_); if (!s.ok()) { from = "BackupEngine::CreateNewBackup"; @@ -1735,19 +1807,21 @@ Status StressTest::TestBackupRestore( const bool allow_persistent = thread->tid == 0; // not too many bool from_latest = false; int count = static_cast(backup_info.size()); + RestoreOptions restore_options; + restore_options.keep_log_files = thread->rand.OneIn(2); + restore_opts_oss << "keep_log_files: " << restore_options.keep_log_files; if (s.ok() && !inplace_not_restore) { if (count > 1) { s = backup_engine->RestoreDBFromBackup( - RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id, + restore_options, backup_info[thread->rand.Uniform(count)].backup_id, restore_dir /* db_dir */, restore_dir /* wal_dir */); if (!s.ok()) { from = "BackupEngine::RestoreDBFromBackup"; } } else { from_latest = true; - s = backup_engine->RestoreDBFromLatestBackup(RestoreOptions(), - restore_dir /* db_dir */, - restore_dir /* wal_dir */); + s = backup_engine->RestoreDBFromLatestBackup( + restore_options, restore_dir /* db_dir */, restore_dir /* wal_dir */); if (!s.ok()) { from = "BackupEngine::RestoreDBFromLatestBackup"; } @@ -1770,9 +1844,9 @@ Status StressTest::TestBackupRestore( std::vector restored_cf_handles; // Not yet implemented: opening restored BlobDB or TransactionDB - Options restore_options; + Options db_opt; if (s.ok() && !FLAGS_use_txn && !FLAGS_use_blob_db) { - s = PrepareOptionsForRestoredDB(&restore_options); + s = PrepareOptionsForRestoredDB(&db_opt); if (!s.ok()) { from = "PrepareRestoredDBOptions in backup/restore"; } @@ -1784,20 +1858,20 @@ Status StressTest::TestBackupRestore( // `ListColumnFamilies` to get names because it won't necessarily give // the same order as `column_family_names_`. assert(FLAGS_clear_column_family_one_in == 0); - for (auto name : column_family_names_) { - cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options)); + for (const auto& name : column_family_names_) { + cf_descriptors.emplace_back(name, ColumnFamilyOptions(db_opt)); } if (inplace_not_restore) { BackupInfo& info = backup_info[thread->rand.Uniform(count)]; - restore_options.env = info.env_for_open.get(); - s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open, + db_opt.env = info.env_for_open.get(); + s = DB::OpenForReadOnly(DBOptions(db_opt), info.name_for_open, cf_descriptors, &restored_cf_handles, &restored_db); if (!s.ok()) { from = "DB::OpenForReadOnly in backup/restore"; } } else { - s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, + s = DB::Open(DBOptions(db_opt), restore_dir, cf_descriptors, &restored_cf_handles, &restored_db); if (!s.ok()) { from = "DB::Open in backup/restore"; @@ -1889,8 +1963,13 @@ Status StressTest::TestBackupRestore( } } if (!s.ok() && (!s.IsIOError() || !std::strstr(s.getState(), "injected"))) { - fprintf(stderr, "Failure in %s with: %s\n", from.c_str(), - s.ToString().c_str()); + fprintf(stderr, + "Failure in %s with: %s under specified BackupEngineOptions: %s, " + "CreateBackupOptions: %s, RestoreOptions: %s (Empty string or " + "missing field indicates default option or value is used)\n", + from.c_str(), s.ToString().c_str(), backup_opt_oss.str().c_str(), + create_backup_opt_oss.str().c_str(), + restore_opts_oss.str().c_str()); } return s; } @@ -2239,12 +2318,31 @@ void StressTest::TestCompactFiles(ThreadState* thread, size_t output_level = std::min(random_level + 1, cf_meta_data.levels.size() - 1); - auto s = db_->CompactFiles(CompactionOptions(), column_family, - input_files, static_cast(output_level)); + CompactionOptions compact_options; + if (thread->rand.OneIn(2)) { + compact_options.output_file_size_limit = FLAGS_target_file_size_base; + } + std::ostringstream compact_opt_oss; + compact_opt_oss << "output_file_size_limit: " + << compact_options.output_file_size_limit; + auto s = db_->CompactFiles(compact_options, column_family, input_files, + static_cast(output_level)); if (!s.ok()) { - fprintf(stdout, "Unable to perform CompactFiles(): %s\n", - s.ToString().c_str()); + // TOOD (hx235): allow an exact list of tolerable failures under stress + // test + bool non_ok_status_allowed = + s.IsManualCompactionPaused() || + (s.getState() && std::strstr(s.getState(), "injected")) || + s.IsAborted() || s.IsInvalidArgument() || s.IsNotSupported(); + fprintf(non_ok_status_allowed ? stdout : stderr, + "Unable to perform CompactFiles(): %s under specified " + "CompactionOptions: %s (Empty string or " + "missing field indicates default option or value is used)\n", + s.ToString().c_str(), compact_opt_oss.str().c_str()); thread->stats.AddNumCompactFilesFailed(1); + if (!non_ok_status_allowed) { + thread->shared->SafeTerminate(); + } } else { thread->stats.AddNumCompactFilesSucceed(1); } @@ -2387,6 +2485,9 @@ void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key, CompactRangeOptions cro; cro.exclusive_manual_compaction = static_cast(thread->rand.Next() % 2); cro.change_level = static_cast(thread->rand.Next() % 2); + if (thread->rand.OneIn(2)) { + cro.target_level = thread->rand.Next() % options_.num_levels; + } std::vector bottom_level_styles = { BottommostLevelCompaction::kSkip, BottommostLevelCompaction::kIfHaveCompactionFilter, @@ -2416,12 +2517,36 @@ void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key, pre_hash = GetRangeHash(thread, pre_snapshot, column_family, start_key, end_key); } - + std::ostringstream compact_range_opt_oss; + compact_range_opt_oss << "exclusive_manual_compaction: " + << cro.exclusive_manual_compaction + << ", change_level: " << cro.change_level + << ", target_level: " << cro.target_level + << ", bottommost_level_compaction: " + << static_cast(cro.bottommost_level_compaction) + << ", allow_write_stall: " << cro.allow_write_stall + << ", max_subcompactions: " << cro.max_subcompactions + << ", blob_garbage_collection_policy: " + << static_cast(cro.blob_garbage_collection_policy) + << ", blob_garbage_collection_age_cutoff: " + << cro.blob_garbage_collection_age_cutoff; Status status = db_->CompactRange(cro, column_family, &start_key, &end_key); if (!status.ok()) { - fprintf(stdout, "Unable to perform CompactRange(): %s\n", - status.ToString().c_str()); + // TOOD (hx235): allow an exact list of tolerable failures under stress test + bool non_ok_status_allowed = + status.IsManualCompactionPaused() || + (status.getState() && std::strstr(status.getState(), "injected")) || + status.IsInvalidArgument() || status.IsNotSupported(); + fprintf(non_ok_status_allowed ? stdout : stderr, + "Unable to perform CompactRange(): %s under specified " + "CompactRangeOptions: %s (Empty string or " + "missing field indicates default option or value is used)\n", + status.ToString().c_str(), compact_range_opt_oss.str().c_str()); + if (!non_ok_status_allowed) { + // Fail fast to preserve the DB state. + thread->shared->SetVerificationFailure(); + } } if (pre_snapshot != nullptr) { @@ -2430,8 +2555,11 @@ void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key, if (pre_hash != post_hash) { fprintf(stderr, "Data hash different before and after compact range " - "start_key %s end_key %s\n", - start_key.ToString(true).c_str(), end_key.ToString(true).c_str()); + "start_key %s end_key %s under specified CompactRangeOptions: %s " + "(Empty string or " + "missing field indicates default option or value is used)\n", + start_key.ToString(true).c_str(), end_key.ToString(true).c_str(), + compact_range_opt_oss.str().c_str()); thread->stats.AddErrors(1); // Fail fast to preserve the DB state. thread->shared->SetVerificationFailure(); @@ -2668,6 +2796,8 @@ void StressTest::PrintEnv() const { static_cast(FLAGS_fail_if_options_file_error)); fprintf(stdout, "User timestamp size bytes : %d\n", static_cast(FLAGS_user_timestamp_size)); + fprintf(stdout, "Persist user defined timestamps : %d\n", + FLAGS_persist_user_defined_timestamps); fprintf(stdout, "WAL compression : %s\n", FLAGS_wal_compression.c_str()); fprintf(stdout, "Try verify sst unique id : %d\n", @@ -2691,7 +2821,7 @@ void StressTest::Open(SharedState* shared, bool reopen) { exit(1); } if (FLAGS_prefix_size != 0 && FLAGS_rep_factory != kHashSkipList) { - fprintf(stderr, + fprintf(stdout, "WARNING: prefix_size is non-zero but " "memtablerep != prefix_hash\n"); } @@ -2762,12 +2892,12 @@ void StressTest::Open(SharedState* shared, bool reopen) { if (sorted_cfn != existing_column_families) { fprintf(stderr, "Expected column families differ from the existing:\n"); fprintf(stderr, "Expected: {"); - for (auto cf : sorted_cfn) { + for (const auto& cf : sorted_cfn) { fprintf(stderr, "%s ", cf.c_str()); } fprintf(stderr, "}\n"); fprintf(stderr, "Existing: {"); - for (auto cf : existing_column_families) { + for (const auto& cf : existing_column_families) { fprintf(stderr, "%s ", cf.c_str()); } fprintf(stderr, "}\n"); @@ -2775,7 +2905,7 @@ void StressTest::Open(SharedState* shared, bool reopen) { assert(sorted_cfn == existing_column_families); } std::vector cf_descriptors; - for (auto name : column_family_names_) { + for (const auto& name : column_family_names_) { if (name != kDefaultColumnFamilyName) { new_column_family_name_ = std::max(new_column_family_name_.load(), std::stoi(name) + 1); @@ -3087,6 +3217,11 @@ bool StressTest::MaybeUseOlderTimestampForPointLookup(ThreadState* thread, return false; } + if (!FLAGS_persist_user_defined_timestamps) { + // Not read with older timestamps to avoid get InvalidArgument. + return false; + } + assert(thread); if (!thread->rand.OneInOpt(3)) { return false; @@ -3116,6 +3251,11 @@ void StressTest::MaybeUseOlderTimestampForRangeScan(ThreadState* thread, return; } + if (!FLAGS_persist_user_defined_timestamps) { + // Not read with older timestamps to avoid get InvalidArgument. + return; + } + assert(thread); if (!thread->rand.OneInOpt(3)) { return; @@ -3175,6 +3315,8 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) { exit(1); } options.comparator = cmp; + options.persist_user_defined_timestamps = + FLAGS_persist_user_defined_timestamps; } bool InitializeOptionsFromFile(Options& options) { @@ -3263,6 +3405,15 @@ void InitializeOptionsFromFlags( block_based_options.max_auto_readahead_size = FLAGS_max_auto_readahead_size; block_based_options.num_file_reads_for_auto_readahead = FLAGS_num_file_reads_for_auto_readahead; + block_based_options.cache_index_and_filter_blocks_with_high_priority = + FLAGS_cache_index_and_filter_blocks_with_high_priority; + block_based_options.use_delta_encoding = FLAGS_use_delta_encoding; + block_based_options.verify_compression = FLAGS_verify_compression; + block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit; + block_based_options.enable_index_compression = FLAGS_enable_index_compression; + block_based_options.index_shortening = + static_cast( + FLAGS_index_shortening); options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); options.db_write_buffer_size = FLAGS_db_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size; @@ -3330,7 +3481,7 @@ void InitializeOptionsFromFlags( FLAGS_compression_use_zstd_dict_trainer; } else if (!FLAGS_compression_use_zstd_dict_trainer) { fprintf( - stderr, + stdout, "WARNING: use_zstd_dict_trainer is false but zstd finalizeDictionary " "cannot be used because ZSTD 1.4.5+ is not linked with the binary." " zstd dictionary trainer will be used.\n"); @@ -3422,7 +3573,7 @@ void InitializeOptionsFromFlags( StringToCompressionType(FLAGS_wal_compression.c_str()); if (FLAGS_enable_tiered_storage) { - options.bottommost_temperature = Temperature::kCold; + options.last_level_temperature = Temperature::kCold; } options.preclude_last_level_data_seconds = FLAGS_preclude_last_level_data_seconds; @@ -3463,6 +3614,43 @@ void InitializeOptionsFromFlags( options.bottommost_file_compaction_delay = FLAGS_bottommost_file_compaction_delay; + + options.allow_fallocate = FLAGS_allow_fallocate; + options.table_cache_numshardbits = FLAGS_table_cache_numshardbits; + options.log_readahead_size = FLAGS_log_readahead_size; + options.bgerror_resume_retry_interval = FLAGS_bgerror_resume_retry_interval; + options.delete_obsolete_files_period_micros = + FLAGS_delete_obsolete_files_period_micros; + options.max_log_file_size = FLAGS_max_log_file_size; + options.log_file_time_to_roll = FLAGS_log_file_time_to_roll; + options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; + options.advise_random_on_open = FLAGS_advise_random_on_open; + // TODO (hx235): test the functionality of `WAL_ttl_seconds`, + // `WAL_size_limit_MB` i.e, `GetUpdatesSince()` + options.WAL_ttl_seconds = FLAGS_WAL_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_WAL_size_limit_MB; + options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync; + options.strict_bytes_per_sync = FLAGS_strict_bytes_per_sync; + options.avoid_flush_during_shutdown = FLAGS_avoid_flush_during_shutdown; + options.dump_malloc_stats = FLAGS_dump_malloc_stats; + options.stats_history_buffer_size = FLAGS_stats_history_buffer_size; + options.skip_stats_update_on_db_open = FLAGS_skip_stats_update_on_db_open; + options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits; + options.sample_for_compression = FLAGS_sample_for_compression; + options.report_bg_io_stats = FLAGS_report_bg_io_stats; + options.manifest_preallocation_size = FLAGS_manifest_preallocation_size; + if (FLAGS_enable_checksum_handoff) { + options.checksum_handoff_file_types = {FileTypeSet::All()}; + } else { + options.checksum_handoff_file_types = {}; + } + options.max_total_wal_size = FLAGS_max_total_wal_size; + options.soft_pending_compaction_bytes_limit = + FLAGS_soft_pending_compaction_bytes_limit; + options.hard_pending_compaction_bytes_limit = + FLAGS_hard_pending_compaction_bytes_limit; + options.max_sequential_skip_in_iterations = + FLAGS_max_sequential_skip_in_iterations; } void InitializeOptionsGeneral( diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index 424570b33c9..f578ae2a0d9 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -291,13 +291,13 @@ class StressTest { }; // Load options from OPTIONS file and populate `options`. -extern bool InitializeOptionsFromFile(Options& options); +bool InitializeOptionsFromFile(Options& options); // Initialize `options` using command line arguments. // When this function is called, `cache`, `block_cache_compressed`, // `filter_policy` have all been initialized. Therefore, we just pass them as // input arguments. -extern void InitializeOptionsFromFlags( +void InitializeOptionsFromFlags( const std::shared_ptr& cache, const std::shared_ptr& filter_policy, Options& options); @@ -322,7 +322,7 @@ extern void InitializeOptionsFromFlags( // // InitializeOptionsGeneral() must not overwrite fields of `options` loaded // from OPTIONS file. -extern void InitializeOptionsGeneral( +void InitializeOptionsGeneral( const std::shared_ptr& cache, const std::shared_ptr& filter_policy, Options& options); @@ -330,7 +330,7 @@ extern void InitializeOptionsGeneral( // user-defined timestamp which requires `-user_timestamp_size=8`. // This function also checks for known (currently) incompatible features with // user-defined timestamp. -extern void CheckAndSetOptionsForUserTimestamp(Options& options); +void CheckAndSetOptionsForUserTimestamp(Options& options); } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 9c24e2c4251..54147ea9caa 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -100,17 +100,6 @@ int db_stress_tool(int argc, char** argv) { env_wrapper_guard = std::make_shared( raw_env, std::make_shared(raw_env->GetFileSystem())); - if (!env_opts && !FLAGS_use_io_uring) { - // If using the default Env (Posix), wrap DbStressEnvWrapper with the - // legacy EnvWrapper. This is a workaround to prevent MultiGet and scans - // from failing when IO uring is disabled. The EnvWrapper - // has a default implementation of ReadAsync that redirects to Read. - legacy_env_wrapper_guard = std::make_shared(raw_env); - env_wrapper_guard = std::make_shared( - legacy_env_wrapper_guard, - std::make_shared( - legacy_env_wrapper_guard->GetFileSystem())); - } db_stress_env = env_wrapper_guard.get(); FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc index b483e154c45..91be3bea630 100644 --- a/db_stress_tool/expected_state.cc +++ b/db_stress_tool/expected_state.cc @@ -79,6 +79,8 @@ std::vector ExpectedState::PrepareDeleteRange( PrepareDelete(cf, key, &prepared); if (prepared) { pending_expected_values.push_back(pending_expected_value); + } else { + pending_expected_value.PermitUnclosedPendingState(); } } return pending_expected_values; @@ -185,7 +187,7 @@ ExpectedStateManager::ExpectedStateManager(size_t max_key, num_column_families_(num_column_families), latest_(nullptr) {} -ExpectedStateManager::~ExpectedStateManager() {} +ExpectedStateManager::~ExpectedStateManager() = default; const std::string FileExpectedStateManager::kLatestBasename = "LATEST"; const std::string FileExpectedStateManager::kStateFilenameSuffix = ".state"; @@ -306,9 +308,10 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { // Populate a tempfile and then rename it to atomically create ".state" // with contents from "LATEST.state" - Status s = CopyFile(FileSystem::Default(), latest_file_path, - state_file_temp_path, 0 /* size */, false /* use_fsync */, - nullptr /* io_tracer */, Temperature::kUnknown); + Status s = + CopyFile(FileSystem::Default(), latest_file_path, Temperature::kUnknown, + state_file_temp_path, Temperature::kUnknown, 0 /* size */, + false /* use_fsync */, nullptr /* io_tracer */); if (s.ok()) { s = FileSystem::Default()->RenameFile(state_file_temp_path, state_file_path, IOOptions(), nullptr /* dbg */); @@ -631,9 +634,9 @@ Status FileExpectedStateManager::Restore(DB* db) { // We are going to replay on top of "`seqno`.state" to create a new // "LATEST.state". Start off by creating a tempfile so we can later make the // new "LATEST.state" appear atomically using `RenameFile()`. - s = CopyFile(FileSystem::Default(), state_file_path, latest_file_temp_path, - 0 /* size */, false /* use_fsync */, nullptr /* io_tracer */, - Temperature::kUnknown); + s = CopyFile(FileSystem::Default(), state_file_path, Temperature::kUnknown, + latest_file_temp_path, Temperature::kUnknown, 0 /* size */, + false /* use_fsync */, nullptr /* io_tracer */); } { @@ -658,26 +661,26 @@ Status FileExpectedStateManager::Restore(DB* db) { if (s.ok()) { s = replayer->Prepare(); } - for (;;) { + for (; s.ok();) { std::unique_ptr record; s = replayer->Next(&record); if (!s.ok()) { + if (s.IsCorruption() && handler->IsDone()) { + // There could be a corruption reading the tail record of the trace + // due to `db_stress` crashing while writing it. It shouldn't matter + // as long as we already found all the write ops we need to catch up + // the expected state. + s = Status::OK(); + } + if (s.IsIncomplete()) { + // OK because `Status::Incomplete` is expected upon finishing all the + // trace records. + s = Status::OK(); + } break; } std::unique_ptr res; - record->Accept(handler.get(), &res); - } - if (s.IsCorruption() && handler->IsDone()) { - // There could be a corruption reading the tail record of the trace due to - // `db_stress` crashing while writing it. It shouldn't matter as long as - // we already found all the write ops we need to catch up the expected - // state. - s = Status::OK(); - } - if (s.IsIncomplete()) { - // OK because `Status::Incomplete` is expected upon finishing all the - // trace records. - s = Status::OK(); + s = record->Accept(handler.get(), &res); } } diff --git a/db_stress_tool/expected_value.h b/db_stress_tool/expected_value.h index 338afc04914..ad0ddf7b434 100644 --- a/db_stress_tool/expected_value.h +++ b/db_stress_tool/expected_value.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include @@ -150,6 +151,11 @@ class ExpectedValue { // `PendingExpectedValue` represents the expected value of a key undergoing a // pending operation in db stress. // +// After a `PendingExpectedValue` object is created, either `Rollback` or +// `Commit` should be called to close its pending state before it's destructed. +// In case no pending state was introduced while creating this +// `PendingExpectedValue` and user want to ignore the unclosed pending state, +// `PermitUnclosedPendingState` should be called explicitly. // This class is not thread-safe. class PendingExpectedValue { public: @@ -158,21 +164,85 @@ class PendingExpectedValue { ExpectedValue final_value) : value_ptr_(value_ptr), orig_value_(orig_value), - final_value_(final_value) {} + final_value_(final_value), + pending_state_closed_(false) {} + + PendingExpectedValue(const PendingExpectedValue& other) + : value_ptr_(other.value_ptr_), + orig_value_(other.orig_value_), + final_value_(other.final_value_), + pending_state_closed_(false) { + other.ClosePendingState(); + } + + PendingExpectedValue(PendingExpectedValue&& other) noexcept + : value_ptr_(std::move(other.value_ptr_)), + orig_value_(std::move(other.orig_value_)), + final_value_(std::move(other.final_value_)), + pending_state_closed_(false) { + other.ClosePendingState(); + } + + PendingExpectedValue& operator=(const PendingExpectedValue& other) { + if (this != &other) { + other.ClosePendingState(); + value_ptr_ = other.value_ptr_; + orig_value_ = other.orig_value_; + final_value_ = other.final_value_; + pending_state_closed_ = false; + } + return *this; + } + + PendingExpectedValue& operator=(PendingExpectedValue&& other) { + if (this != &other) { + other.ClosePendingState(); + value_ptr_ = std::move(other.value_ptr_); + orig_value_ = std::move(other.orig_value_); + final_value_ = std::move(other.final_value_); + pending_state_closed_ = false; + } + return *this; + } + + ~PendingExpectedValue() { assert(pending_state_closed_); } void Commit() { + assert(!pending_state_closed_); + ClosePendingState(); // To prevent low-level instruction reordering that results // in setting expected value happens before db write std::atomic_thread_fence(std::memory_order_release); value_ptr_->store(final_value_.Read()); } + // Rollbacks the key to its original state. + // This rollbacks the pending state created in `ExpectedState::Precommit`, + // such as pending delete, pending put. If `ExpectedState::Precommit()` is not + // called before creating this `PendingExpectedValue`, this is a no-op. + void Rollback() { + assert(!pending_state_closed_); + ClosePendingState(); + // To prevent low-level instruction reordering that results + // in setting expected value happens before db write + std::atomic_thread_fence(std::memory_order_release); + value_ptr_->store(orig_value_.Read()); + } + + void PermitUnclosedPendingState() const { + assert(!pending_state_closed_); + ClosePendingState(); + } + uint32_t GetFinalValueBase() { return final_value_.GetValueBase(); } private: - std::atomic* const value_ptr_; - const ExpectedValue orig_value_; - const ExpectedValue final_value_; + inline void ClosePendingState() const { pending_state_closed_ = true; } + + std::atomic* value_ptr_; + ExpectedValue orig_value_; + ExpectedValue final_value_; + mutable bool pending_state_closed_; }; // `ExpectedValueHelper` provides utils to parse `ExpectedValue` to obtain diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index c7d38339bb2..ee90711b1f7 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -150,7 +150,7 @@ std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey(uint32_t a) { PutFixed32(&ret, kPrimaryIndexId); PutFixed32(&ret, a); - char* const buf = &ret[0]; + char* const buf = ret.data(); std::reverse(buf, buf + sizeof(kPrimaryIndexId)); std::reverse(buf + sizeof(kPrimaryIndexId), buf + sizeof(kPrimaryIndexId) + sizeof(a)); @@ -162,7 +162,7 @@ std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c) { PutFixed32(&ret, kSecondaryIndexId); PutFixed32(&ret, c); - char* const buf = &ret[0]; + char* const buf = ret.data(); std::reverse(buf, buf + sizeof(kSecondaryIndexId)); std::reverse(buf + sizeof(kSecondaryIndexId), buf + sizeof(kSecondaryIndexId) + sizeof(c)); @@ -176,7 +176,7 @@ std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c, PutFixed32(&ret, c); PutFixed32(&ret, a); - char* const buf = &ret[0]; + char* const buf = ret.data(); std::reverse(buf, buf + sizeof(kSecondaryIndexId)); std::reverse(buf + sizeof(kSecondaryIndexId), buf + sizeof(kSecondaryIndexId) + sizeof(c)); @@ -373,10 +373,15 @@ Status MultiOpsTxnsStressTest::TestGet( ThreadState* thread, const ReadOptions& read_opts, const std::vector& /*rand_column_families*/, const std::vector& /*rand_keys*/) { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); uint32_t a = 0; uint32_t pos = 0; std::tie(a, pos) = ChooseExistingA(thread); - return PointLookupTxn(thread, read_opts, a); + Status s = PointLookupTxn(thread, read_opts, a); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + return s; } // Not used. @@ -416,10 +421,15 @@ Status MultiOpsTxnsStressTest::TestIterate( ThreadState* thread, const ReadOptions& read_opts, const std::vector& /*rand_column_families*/, const std::vector& /*rand_keys*/) { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); uint32_t c = 0; uint32_t pos = 0; std::tie(c, pos) = ChooseExistingC(thread); - return RangeScanTxn(thread, read_opts, c); + Status s = RangeScanTxn(thread, read_opts, c); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + return s; } // Not intended for use. @@ -1221,7 +1231,11 @@ void MultiOpsTxnsStressTest::VerifyPkSkFast(const ReadOptions& read_options, assert(db_ == db); assert(db_ != nullptr); + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); const Snapshot* const snapshot = db_->GetSnapshot(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); assert(snapshot); ManagedSnapshot snapshot_guard(db_, snapshot); diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 27a20fd5a5d..a94de38971d 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -17,9 +17,9 @@ namespace ROCKSDB_NAMESPACE { class NonBatchedOpsStressTest : public StressTest { public: - NonBatchedOpsStressTest() {} + NonBatchedOpsStressTest() = default; - virtual ~NonBatchedOpsStressTest() {} + virtual ~NonBatchedOpsStressTest() = default; void VerifyDb(ThreadState* thread) const override { // This `ReadOptions` is for validation purposes. Ignore @@ -582,7 +582,11 @@ class NonBatchedOpsStressTest : public StressTest { int error_count = 0; // Do a consistency check between Get and MultiGet. Don't do it too // often as it will slow db_stress down - bool do_consistency_check = thread->rand.OneIn(4); + // + // CompactionFilter can make snapshot non-repeatable by removing keys + // protected by snapshot + bool do_consistency_check = + !FLAGS_enable_compaction_filter && thread->rand.OneIn(4); ReadOptions readoptionscopy = read_opts; @@ -624,7 +628,7 @@ class NonBatchedOpsStressTest : public StressTest { if (!shared->AllowsOverwrite(rand_key) && shared->Exists(column_family, rand_key)) { // Just do read your write checks for keys that allow overwrites. - ryw_expected_values.push_back(std::nullopt); + ryw_expected_values.emplace_back(std::nullopt); continue; } // With a 1 in 10 probability, insert the just added key in the batch @@ -667,7 +671,7 @@ class NonBatchedOpsStressTest : public StressTest { thread->shared->SafeTerminate(); } } else { - ryw_expected_values.push_back(std::nullopt); + ryw_expected_values.emplace_back(std::nullopt); } } } @@ -796,18 +800,27 @@ class NonBatchedOpsStressTest : public StressTest { fprintf(stderr, "Get error: %s\n", s.ToString().c_str()); is_consistent = false; } else if (!s.ok() && tmp_s.ok()) { - fprintf(stderr, "MultiGet returned different results with key %s\n", - key.ToString(true).c_str()); + fprintf(stderr, + "MultiGet(%d) returned different results with key %s. " + "Snapshot Seq No: %" PRIu64 "\n", + column_family, key.ToString(true).c_str(), + readoptionscopy.snapshot->GetSequenceNumber()); fprintf(stderr, "Get returned ok, MultiGet returned not found\n"); is_consistent = false; } else if (s.ok() && tmp_s.IsNotFound()) { - fprintf(stderr, "MultiGet returned different results with key %s\n", - key.ToString(true).c_str()); + fprintf(stderr, + "MultiGet(%d) returned different results with key %s. " + "Snapshot Seq No: %" PRIu64 "\n", + column_family, key.ToString(true).c_str(), + readoptionscopy.snapshot->GetSequenceNumber()); fprintf(stderr, "MultiGet returned ok, Get returned not found\n"); is_consistent = false; } else if (s.ok() && value != expected_value.ToString()) { - fprintf(stderr, "MultiGet returned different results with key %s\n", - key.ToString(true).c_str()); + fprintf(stderr, + "MultiGet(%d) returned different results with key %s. " + "Snapshot Seq No: %" PRIu64 "\n", + column_family, key.ToString(true).c_str(), + readoptionscopy.snapshot->GetSequenceNumber()); fprintf(stderr, "MultiGet returned value %s\n", expected_value.ToString(true).c_str()); fprintf(stderr, "Get returned value %s\n", @@ -910,7 +923,18 @@ class NonBatchedOpsStressTest : public StressTest { PinnableWideColumns from_db; - const Status s = db_->GetEntity(read_opts, cfh, key, &from_db); + ReadOptions read_opts_copy = read_opts; + std::string read_ts_str; + Slice read_ts_slice; + if (FLAGS_user_timestamp_size > 0) { + read_ts_str = GetNowNanos(); + read_ts_slice = read_ts_str; + read_opts_copy.timestamp = &read_ts_slice; + } + bool read_older_ts = MaybeUseOlderTimestampForPointLookup( + thread, read_ts_str, read_ts_slice, read_opts_copy); + + const Status s = db_->GetEntity(read_opts_copy, cfh, key, &from_db); int error_count = 0; @@ -933,7 +957,7 @@ class NonBatchedOpsStressTest : public StressTest { thread->stats.AddGets(1, 1); - if (!FLAGS_skip_verifydb) { + if (!FLAGS_skip_verifydb && !read_older_ts) { const WideColumns& columns = from_db.columns(); ExpectedValue expected = shared->Get(rand_column_families[0], rand_keys[0]); @@ -956,7 +980,7 @@ class NonBatchedOpsStressTest : public StressTest { } else if (s.IsNotFound()) { thread->stats.AddGets(1, 0); - if (!FLAGS_skip_verifydb) { + if (!FLAGS_skip_verifydb && !read_older_ts) { ExpectedValue expected = shared->Get(rand_column_families[0], rand_keys[0]); if (ExpectedValueHelper::MustHaveExisted(expected, expected)) { @@ -1051,7 +1075,10 @@ class NonBatchedOpsStressTest : public StressTest { fault_fs_guard->DisableErrorInjection(); } - const bool check_get_entity = !error_count && thread->rand.OneIn(4); + // CompactionFilter can make snapshot non-repeatable by removing keys + // protected by snapshot + const bool check_get_entity = !FLAGS_enable_compaction_filter && + !error_count && thread->rand.OneIn(4); for (size_t i = 0; i < num_keys; ++i) { const Status& s = statuses[i]; @@ -1311,6 +1338,7 @@ class NonBatchedOpsStressTest : public StressTest { } if (!s.ok()) { + pending_expected_value.Rollback(); if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { is_db_stopped_ = true; @@ -1368,6 +1396,7 @@ class NonBatchedOpsStressTest : public StressTest { } if (!s.ok()) { + pending_expected_value.Rollback(); if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { @@ -1400,6 +1429,7 @@ class NonBatchedOpsStressTest : public StressTest { } if (!s.ok()) { + pending_expected_value.Rollback(); if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { @@ -1464,6 +1494,10 @@ class NonBatchedOpsStressTest : public StressTest { s = db_->DeleteRange(write_opts, cfh, key, end_key); } if (!s.ok()) { + for (PendingExpectedValue& pending_expected_value : + pending_expected_values) { + pending_expected_value.Rollback(); + } if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { is_db_stopped_ = true; @@ -1492,6 +1526,7 @@ class NonBatchedOpsStressTest : public StressTest { const std::string sst_filename = FLAGS_db + "/." + std::to_string(thread->tid) + ".sst"; Status s; + std::ostringstream ingest_options_oss; if (db_stress_env->FileExists(sst_filename).ok()) { // Maybe we terminated abnormally before, so cleanup to give this file // ingestion a clean slate @@ -1560,19 +1595,38 @@ class NonBatchedOpsStressTest : public StressTest { s = sst_file_writer.Finish(); } if (s.ok()) { + IngestExternalFileOptions ingest_options; + ingest_options.move_files = thread->rand.OneInOpt(2); + ingest_options.verify_checksums_before_ingest = thread->rand.OneInOpt(2); + ingest_options.verify_checksums_readahead_size = + thread->rand.OneInOpt(2) ? 1024 * 1024 : 0; + ingest_options_oss << "move_files: " << ingest_options.move_files + << ", verify_checksums_before_ingest: " + << ingest_options.verify_checksums_before_ingest + << ", verify_checksums_readahead_size: " + << ingest_options.verify_checksums_readahead_size; s = db_->IngestExternalFile(column_families_[column_family], - {sst_filename}, IngestExternalFileOptions()); + {sst_filename}, ingest_options); } if (!s.ok()) { + for (PendingExpectedValue& pending_expected_value : + pending_expected_values) { + pending_expected_value.Rollback(); + } if (!s.IsIOError() || !std::strstr(s.getState(), "injected")) { - fprintf(stderr, "file ingestion error: %s\n", s.ToString().c_str()); + fprintf(stderr, + "file ingestion error: %s under specified " + "IngestExternalFileOptions: %s (Empty string or " + "missing field indicates default option or value is used)\n", + s.ToString().c_str(), ingest_options_oss.str().c_str()); thread->shared->SafeTerminate(); } else { fprintf(stdout, "file ingestion error: %s\n", s.ToString().c_str()); } } else { - for (size_t i = 0; i < pending_expected_values.size(); ++i) { - pending_expected_values[i].Commit(); + for (PendingExpectedValue& pending_expected_value : + pending_expected_values) { + pending_expected_value.Commit(); } } } @@ -2037,12 +2091,17 @@ class NonBatchedOpsStressTest : public StressTest { Slice slice(value_from_db); uint32_t value_base = GetValueBase(slice); shared->SyncPut(cf, key, value_base); + return true; } else if (s.IsNotFound()) { // Value doesn't exist in db, update state to reflect that shared->SyncDelete(cf, key); + return true; } - return true; } + char expected_value_data[kValueMaxLen]; + size_t expected_value_data_size = + GenerateValue(expected_value.GetValueBase(), expected_value_data, + sizeof(expected_value_data)); // compare value_from_db with the value in the shared state if (s.ok()) { @@ -2054,10 +2113,6 @@ class NonBatchedOpsStressTest : public StressTest { key, value_from_db, ""); return false; } - char expected_value_data[kValueMaxLen]; - size_t expected_value_data_size = - GenerateValue(expected_value.GetValueBase(), expected_value_data, - sizeof(expected_value_data)); if (!ExpectedValueHelper::InExpectedValueBaseRange( value_base_from_db, expected_value, expected_value)) { VerificationAbort(shared, msg_prefix + ": Unexpected value found", cf, @@ -2084,17 +2139,16 @@ class NonBatchedOpsStressTest : public StressTest { } else if (s.IsNotFound()) { if (ExpectedValueHelper::MustHaveExisted(expected_value, expected_value)) { - char expected_value_data[kValueMaxLen]; - size_t expected_value_data_size = - GenerateValue(expected_value.GetValueBase(), expected_value_data, - sizeof(expected_value_data)); VerificationAbort( shared, msg_prefix + ": Value not found: " + s.ToString(), cf, key, "", Slice(expected_value_data, expected_value_data_size)); return false; } } else { - assert(false); + VerificationAbort(shared, msg_prefix + "Non-OK status: " + s.ToString(), + cf, key, "", + Slice(expected_value_data, expected_value_data_size)); + return false; } return true; } diff --git a/docs/_posts/2024-02-20-foreign-function-interface.markdown b/docs/_posts/2024-02-20-foreign-function-interface.markdown new file mode 100644 index 00000000000..da1dd26eaf2 --- /dev/null +++ b/docs/_posts/2024-02-20-foreign-function-interface.markdown @@ -0,0 +1,278 @@ +--- +title: Java Foreign Function Interface +layout: post +author: alanpaxton +category: blog +--- +# Java Foreign Function Interface (FFI) + +Evolved Binary has been working on several aspects of how the Java API to RocksDB can be improved. The recently introduced FFI features in Java provide significant opportunities for improving the API. We have investigated this through a prototype implementation. + +Java 19 introduced a new [FFI Preview](https://openjdk.org/jeps/424) which is described as *an API by which Java programs can interoperate with code and data outside of the Java runtime. By efficiently invoking foreign functions (i.e., code outside the JVM), and by safely accessing foreign memory (i.e., memory not managed by the JVM), the API enables Java programs to call native libraries and process native data without the brittleness and danger of JNI*. + +If the twin promises of efficiency and safety are realised, then using FFI as a mechanism to support a future RocksDB API may be of significant benefit. + + - Remove the complexity of `JNI` access to `C++ RocksDB` + - Improve RocksDB Java API performance + - Reduce the opportunity for coding errors in the RocksDB Java API + + Here's what we did. We have + + - created a prototype FFI branch + - updated the RocksDB Java build to use Java 19 + - implemented an `FFI Preview API` version of core RocksDB feature (`get()`) + - Extended the current JMH benchmarks to also benchmark the new FFI methods. Usefully, JNI and FFI can co-exist peacefully, so we use the existing RocksDB Java to do support work around the FFI-based `get()` implementation. + +## Implementation + +### How JNI Works + +`JNI` requires a preprocessing step during build/compilation to generate header files which are linked into by Pure Java code. `C++` implementations of the methods in the headers are implemented. Corresponding `native` methods are declared in Java and the whole is linked together. + +Code in the `C++` methods uses what amounts to a `JNI` library to access Java values and objects and to create Java objects in response. + +### How FFI Works + +`FFI` provides the facility for Java to call existing native (in our case C++) code from Pure Java without having to generate support files during compilation steps. `FFI` does support an external tool (`jextract`) which makes generating common boilerplate easier and less error prone, but we choose to start prototyping without it, in part better to understand how things really work. + +`FFI` does its job by providing 2 things +1. A model for allocating, reading and writing native memory and native structures within that memory +2. A model for discovering and calling native methods with parameters consisting of native memory references and/or values + +The called `C++` is invoked entirely natively. It does not have to access any Java objects to retrieve data it needs. Therefore existing packages in `C++` and other sufficiently low level languages can be called from `Java` without having to implement stubs in the `C++`. + +### Our Approach + +While we could in principle avoid writing any C++, C++ objects and classes are not easily defined in the FFI model, so to begin with it is easier to write some very simple `C`-like methods/stubs in C++ which can immediately call into the object-oriented core of RocksDB. We define structures with which to pass parameters to and receive results from the `C`-like method(s) we implement. + +#### `C++` Side + +The first method we implement is +```C +extern "C" int rocksdb_ffi_get_pinnable( + ROCKSDB_NAMESPACE::DB* db, ROCKSDB_NAMESPACE::ReadOptions* read_options, + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf, rocksdb_input_slice_t* key, + rocksdb_pinnable_slice_t* value); +``` +our input structure is +```C +typedef struct rocksdb_input_slice { + const char* data; + size_t size; +} rocksdb_input_slice_t; +``` +and our output structure is a pinnable slice (of which more later) +```C +typedef struct rocksdb_pinnable_slice { + const char* data; + size_t size; + ROCKSDB_NAMESPACE::PinnableSlice* pinnable_slice; + bool is_pinned; +} rocksdb_pinnable_slice_t; +``` + +#### `Java` Side + + We implement an `FFIMethod` class to advertise a `java.lang.invoke.MethodHandle` for each of our helper stubs + + ```java + public static MethodHandle GetPinnable; // handle which refers to the rocksdb_ffi_get_pinnable method in C++ + public static MethodHandle ResetPinnable; // handle which refers to the rocksdb_ffi_reset_pinnable method in C++ +``` +We also implement an `FFILayout` class to describe each of the passed structures (`rocksdb_input_slice` , `rocksdb_pinnable_slice` and `rocksdb_output_slice`) in `Java` terms + + ```java + public static class InputSlice { + static final GroupLayout Layout = ... + static final VarHandle Data = ... + static final VarHandle Size = ... + }; + + public static class PinnableSlice { + static final GroupLayout Layout = ... + static final VarHandle Data = ... + static final VarHandle Size = ... + static final VarHandle IsPinned = ... + }; + + public static class OutputSlice { + static final GroupLayout Layout = ... + static final VarHandle Data = ... + static final VarHandle Size = ... + }; + ``` + + The `FFIDB` class, which implements the public Java FFI API methods, makes use of `FFIMethod` and `FFILayout` to make the code for each individual method as idiomatic and efficient as possible. This class also contains `java.lang.foreign.MemorySession` and `java.lang.foreign.SegmentAllocator` objects which control the lifetime of native memory sessions and allow us to allocate lifetime-limited native memory which can be written and read by Java, and passed to native methods. + + At the user level, we then present a method which wraps the details of use of `FFIMethod` and `FFILayout` to implement our single, core Java API `get()` method + + ```java + public GetPinnableSlice getPinnableSlice(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final MemorySegment keySegment, + final GetParams getParams) + ``` + +The flow of implementation of `getPinnableSlice()`, in common with any other core RocksDB FFI API method becomes: + 1. Allocate `MemorySegment`s for `C++` structures using `Layout`s from `FFILayout` + 2. Write to the allocated structures using `VarHandle`s from `FFILayout` + 3. Invoke the native method using the `MethodHandle` from `FFIMethod` and addresses of instantiated `MemorySegment`s, or value types, as parameters + 4. Read the call result and the output parameter(s), again using `VarHandle`s from `FFILayout` to perform the mapping. + +For the `getPinnableSlice()` method, on successful return from an invocation of `rocksdb_ffi_get()`, the `PinnableSlice` object will contain the `data` and `size` fields of a pinnable slice (see below) containing the requested value. A `MemorySegment` referring to the native memory of the pinnable slice is then constructed, and used by the client to retrieve the value in whatever fashion they choose. + +### Pinnable Slices + +RocksDB offers core (C++) API methods using the concept of a [`PinnableSlice`](http://rocksdb.org/blog/2017/08/24/pinnableslice.html) to return fetched data values while reducing copies to a minimum. We take advantage of this to base our central `get()` method(s) on `PinnableSlice`s. Methods mirroring the existing `JNI`-based API can then be implemented in pure Java by wrapping the core `getPinnableSlice()`. + +So we implement +```java +public record GetPinnableSlice(Status.Code code, Optional pinnableSlice) {} + +public GetPinnableSlice getPinnableSlice( + final ColumnFamilyHandle columnFamilyHandle, final byte[] key) +``` +and we wrap that to provide +```java +public record GetBytes(Status.Code code, byte[] value, long size) {} + +public GetBytes get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) +``` + +## Benchmark Results + +We extended existing RocksDB Java JNI benchmarks with new benchmarks based on FFI. Full benchmark run on Ubuntu, including new benchmarks. + +```bash +java --enable-preview --enable-native-access=ALL-UNNAMED -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=100000 -p keySize=128 -p valueSize=4096,65536 -p columnFamilyTestType="no_column_family" -rf csv org.rocksdb.jmh.GetBenchmarks +``` + +![JNI vs FFI](/static/images/jni-ffi/jmh-result-fixed.png) + +### Discussion + +We have plotted the performance (more operations is better) of a selection of benchmarks, +```bash +q "select Benchmark,Score from ./plot/jmh-result-fixed.csv where \"Param: keyCount\"=100000 and \"Param: valueSize\"=65536 -d, -H +``` + + - JNI versions of benchmarks are previously implemented `jmh` benchmarks for measuring the performance of the current RocksDB Java interface. + - FFI versions of benchmarks are equivalent benchmarks (as far as possible) implemented using the FFI mechanisms. + + We can see that for all benchmarks which have equivalent FFI and JNI pairs, the JNI version is only very marginally faster. FFI has successfully optimized away most of the extra safety-checking of the new invocation mechanism. + + Our initial implementation of FFI benchmarks lagged the JNI benchmarks quite significantly, but we have received extremely helpful support from Maurizio Cimadamore of the Panama Dev team, to help us optimize the performance of our FFI implementation. We consider that the small remaining performance gap is a feature of the remaining extra bounds checking of FFI. + + For basic `get()` the result buffer is allocated by the method, so that there is a cost of allocation associated with each request. + - `ffiGet` vs `get` + - The JNI version is very marginally faster than FFI + + For preallocated `get()` where the result buffer is supplied to the method, we avoid an allocation of a fresh result buffer on each call, and the test recycles its result buffers. Then the same small difference persists + - JNI is very marginally faster than FFI + - `preallocatedGet()` is a lot faster than basic `get()` + + We implemented some methods where the key for the `get()` is randomized, so that any ordering effects can be accounted for. The same differences persisted. + + The FFI interface gives us a natural way to expose RocksDB's [pinnable slice](http://rocksdb.org/blog/2017/08/24/pinnableslice.html) mechanism. When we provide a benchmark which accesses the raw `PinnableSlice` API, as expected this is the fastest method of any; however we are not comparing like with like: + - `ffiGetPinnableSlice()` returns a handle to the RocksDB memory containing the slice, and presents that as an FFI `MemorySegment`. No copying of the memory in the segment occurs. + + As noted above, we implement the new FFI-based `get()` methods using the new FFI-based `getPinnableSlice()` method, and copying out the result. So the `ffiGet` and `ffiPreallocatedGet` benchmarks use this mechanism underneath. + + In an effort to discover whether using the Java APIs to copy from the pinnable slice backed `MemorySegment` was a problem, we implemented a separate `ffiGetOutputSlice()` benchmark which copies the result into a (Java allocated native memory) segment at the C++ side. + - `ffiGetOutputSlice()` is faster than `ffiPreallocatedGet()` and is in fact at least as fast as `preallocatedGet()`, which is an almost exact analogue in the JNI world. + + So it appears that we can build an FFI-based API with equal performance to the JNI-based one. + + Thinking about the (very small, but probably statistically significant) difference between our `ffiGetPinnableSlice()`-based FFI calls and the JNI-based calls, it is reasonable to expect that some of the cost is the extra FFI call to C++ to release the pinned slice as a separate operation. A null FFI method call is extremely fast, but it does take some time. + + - We would recommend looking again the performance of the FFI-based implementation when Panama is release post-Preview in Java 21. It seems that at least with Java 20 the performance is of our FFI benchmarks is not significantly different from that of the Java 19 version. + +### Copies versus Calls + +The second method call over the FFI boundary to release a pinnable slice has a cost. We compared the `ffiGetOutputSlice()` and `ffiGetPinnableSlice()` benchmarks in order to examine this cost. We ran it with a fixed ky size (128 bytes); the key size is likely to be pretty much irrelevant anyway; we varied the value size read from 16 bytes to 16k, and we found a crossover point between 1k and 4k for performance: + +![Plot](/static/images/jni-ffi/jmh-result-pinnable-vs-output-plot.png) + +- `ffiGetOutputSlice()` is faster when values read are 1k in size or smaller. The cost of an extra copy in the C++ side from the pinnable slice buffer into the supplied buffer allocated by Java Foreign Memory API is less than the cost of the extra call to release a pinnable slice. +- `ffiGetPinnableSlice()` is faster when values read are 4k in size, or larger. Consistent with intuition, the advantage grows with larger read values. + +The way that the RocksDB API is constructed means that of the 2 methods compared, `ffiGetOutputSlice()` will always make exactly 1 more copy than `ffiGetPinnableSlice()`. The underlying RocksDB C++ API will always copy into its own temporary buffer if it decides that it cannot pin an internal buffer, and that will be returned as the pinnable slice. There is a potential optimization where the temporary buffer could be replaced by an output buffer, such as that supplied by `ffiGetOutputSlice()`; in practice that is a hard fix to hack in. Its effectiveness depends on how often RocksDB fails to pin an internal buffer. + +A solution which either filled a buffer *or* returned a pinnable slice would give us the best of both worlds. + +## Other Conclusions + +### Build Processing + +- It is easier to implement an interface using FFI than JNI. No intermediate build processing or code generation steps were needed to implement this protoype. + +- For a production version, we would urge using `jextract` to automate the process of generating Java API methods from the set of supporting stubs we generate. + +### Safety + +- The use of `jextract` will give a similar level of type security to the use of JNI, when crossing the language boundary. But we do not believe FFI is significantly more type-safe than JNI for method invocation. Neither is it less safe, though. + +### Native Memory + +Panama's *Foreign-Memory Access API* appears to us to be the most significant part of the whole project. At the `Java` side of RocksDB it gives us a clean mechanism (a `MemorySegment`) for holding RocksDB data (e.g. as from the result of a `get()`) call pending its forwarding to client code or network buffers. + +We have taken advantage of this mechanism to provide the core `FFIDB.getPinnableSlice()` method in our Panama-based API. The rest of our prototype `get()` API, duplicating the existing *JNI*-based API, is then a *Pure Java* library on top of `FFIDB.getPinnableSlice()` and `FFIPinnableSlice.reset()`. + +The common standard for foreign memory opens up the possibility of efficient interoperation between RocksDB and Java clients (e.g. Kafka). We think that this is really the key to higher performing, more integrated Java-based systems: +- This could result in data never being copied into Java memory, or a significant reduction in copies, as native `MemorySegment`s are handed off between co-operating Java clients of fundamentally native APIs. This extra potential performance can be extremely useful when 2 or more clients are interoperating; we still need to provide a simplest possible API wrapping these calls (like our prototype `get()`), which operates at a similar level to the current Java API. +- Some thought should be applied to how this architecture would interact with the cache layer(s) in RocksDB, and whether it can be accommodated within the present RocksDB architecture. How long can 3rd-party applications *pin* pages in the RocksDB cache without disrupting RocksDB normal behaviour (e.g. compaction) ? + +## Summary + +1. Panama/FFI (in [Preview](https://openjdk.org/jeps/424)) is a highly capable technology for (re)building the RocksDB Java API, although the supported language level of RocksDB and the planned release schedule for Panama mean that it could not replace JNI in production for some time to come. +2. Panama/FFI would seem to offer comparable performance to JNI; there is no strong performance argument *for* a re-implementation of a standalone RocksDB Java API. But the opportunity to provide a natural pinnable slice-based API gives a lot of flexibility; not least because an efficient API could be built mostly in Java with only a small underlying layer implementing the pinnable slice interface. +3. Panama/FFI can remove some boilerplate (native method declarations) and allow Java programs to access `C` libraries without stub code, but calling a `C++`-based library still requires `C` stubs; a possible approach would be to use the RocksDB `C` API as the basis for a rebuilt Java API. This would allow us to remove all the existing JNI boilerplate, and concentrate support effort on the `C` API. An alternative approach would be to build a robust API based on [Reference Counting](https://github.com/facebook/rocksdb/pull/10736), but using FFI. +4. Panama/FFI really shines as a foreign memory standard for a Java API that can allow efficient interoperation between RocksDB Java clients and other (Java and native) components of a system. Foreign Memory gives us a model for how to efficiently return data from RocksDB; as pinnable slices with their contents presented in `MemorySegment`s. If we focus on designing an API *for native interoperability* we think this can be highly productive in opening RocksDB to new uses and opportunities in future. + +## Appendix + +### Code and Data + +The [Experimental Pull Request](https://github.com/facebook/rocksdb/pull/11095/files) contains the source code implemented, +together with further data plots and the source CSV files for all data plots. + +### Running + +This is an example run; the jmh parameters (after `-p`) can be changed to measure performance with varying key counts, and key and value sizes. +```bash +java --enable-preview --enable-native-access=ALL-UNNAMED -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=100000 -p keySize=128 -p valueSize=4096,65536 -p columnFamilyTestType="no_column_family" -rf csv org.rocksdb.jmh.GetBenchmarks -wi 1 -to 1m -i 1 +``` + +### Processing + +Use [`q`](http://harelba.github.io/q/) to select the csv output for analysis and graphing. + + - Note that we edited the column headings for easier processing + +```bash +q "select Benchmark,Score,Error from ./plot/jmh-result.csv where keyCount=100000 and valueSize=65536" -d, -H -C readwrite +``` + +### Java 19 installation + +We followed the instructions to install [Azul](https://docs.azul.com/core/zulu-openjdk/install/debian). Then select the correct instance of java locally: +```bash +sudo update-alternatives --config java +sudo update-alternatives --config javac +``` +And set `JAVA_HOME` appropriately. In my case, `sudo update-alternatives --config java` listed a few JVMs thus: +``` + 0 /usr/lib/jvm/bellsoft-java8-full-amd64/bin/java 20803123 auto mode + 1 /usr/lib/jvm/bellsoft-java8-full-amd64/bin/java 20803123 manual mode + 2 /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1111 manual mode +* 3 /usr/lib/jvm/zulu19/bin/java 2193001 manual mode +``` +For our environment, we set this: +```bash +export JAVA_HOME=/usr/lib/jvm/zulu19 +``` + +The default version of Maven avaiable on the Ubuntu package repositories (3.6.3) is incompatible with Java 19. You will need to install a later [Maven](https://maven.apache.org/install.html), and use it. I used `3.8.7` successfully. + +### Java 20, 21, 22 and subsequent versions + +The FFI version we used was a preview in Java 19, and the interface has changed through to Java 22, where it has been finalized. Future work with this prototype will need to update the code to use the changed interface. \ No newline at end of file diff --git a/docs/static/images/jni-ffi/jmh-result-fixed.png b/docs/static/images/jni-ffi/jmh-result-fixed.png new file mode 100644 index 00000000000..0d394dd1eb7 Binary files /dev/null and b/docs/static/images/jni-ffi/jmh-result-fixed.png differ diff --git a/docs/static/images/jni-ffi/jmh-result-pinnable-vs-output-plot.png b/docs/static/images/jni-ffi/jmh-result-pinnable-vs-output-plot.png new file mode 100644 index 00000000000..b5a24004b14 Binary files /dev/null and b/docs/static/images/jni-ffi/jmh-result-pinnable-vs-output-plot.png differ diff --git a/docs/static/images/jni-ffi/jmh-result-select.png b/docs/static/images/jni-ffi/jmh-result-select.png new file mode 100644 index 00000000000..e6999d0f848 Binary files /dev/null and b/docs/static/images/jni-ffi/jmh-result-select.png differ diff --git a/env/composite_env.cc b/env/composite_env.cc index 8ddc9a1a6cd..59434785ced 100644 --- a/env/composite_env.cc +++ b/env/composite_env.cc @@ -504,7 +504,7 @@ EnvWrapper::EnvWrapper(const std::shared_ptr& t) : target_(t) { RegisterOptions("", &target_, &env_wrapper_type_info); } -EnvWrapper::~EnvWrapper() {} +EnvWrapper::~EnvWrapper() = default; Status EnvWrapper::PrepareOptions(const ConfigOptions& options) { target_.Prepare(); diff --git a/env/emulated_clock.h b/env/emulated_clock.h index 62273763503..a05ebdd3ada 100644 --- a/env/emulated_clock.h +++ b/env/emulated_clock.h @@ -34,7 +34,7 @@ class EmulatedSystemClock : public SystemClockWrapper { static const char* kClassName() { return "TimeEmulatedSystemClock"; } const char* Name() const override { return kClassName(); } - virtual void SleepForMicroseconds(int micros) override { + void SleepForMicroseconds(int micros) override { sleep_counter_++; if (no_slowdown_ || time_elapse_only_sleep_) { addon_microseconds_.fetch_add(micros); @@ -70,7 +70,7 @@ class EmulatedSystemClock : public SystemClockWrapper { int GetSleepCounter() const { return sleep_counter_.load(); } - virtual Status GetCurrentTime(int64_t* unix_time) override { + Status GetCurrentTime(int64_t* unix_time) override { Status s; if (time_elapse_only_sleep_) { *unix_time = maybe_starting_time_; @@ -84,22 +84,22 @@ class EmulatedSystemClock : public SystemClockWrapper { return s; } - virtual uint64_t CPUNanos() override { + uint64_t CPUNanos() override { cpu_counter_++; return SystemClockWrapper::CPUNanos(); } - virtual uint64_t CPUMicros() override { + uint64_t CPUMicros() override { cpu_counter_++; return SystemClockWrapper::CPUMicros(); } - virtual uint64_t NowNanos() override { + uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) + addon_microseconds_.load() * 1000; } - virtual uint64_t NowMicros() override { + uint64_t NowMicros() override { return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowMicros()) + addon_microseconds_.load(); } diff --git a/env/env.cc b/env/env.cc index 948765bb916..6eeb02eb302 100644 --- a/env/env.cc +++ b/env/env.cc @@ -359,7 +359,7 @@ class LegacyFileSystemWrapper : public FileSystem { public: // Initialize an EnvWrapper that delegates all calls to *t explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} - ~LegacyFileSystemWrapper() override {} + ~LegacyFileSystemWrapper() override = default; static const char* kClassName() { return "LegacyFileSystem"; } const char* Name() const override { return kClassName(); } @@ -624,7 +624,7 @@ Env::Env(const std::shared_ptr& fs, const std::shared_ptr& clock) : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} -Env::~Env() {} +Env::~Env() = default; Status Env::NewLogger(const std::string& fname, std::shared_ptr* result) { @@ -797,7 +797,7 @@ std::string Env::GenerateUniqueId() { // Use 36 character format of RFC 4122 result.resize(36U); - char* buf = &result[0]; + char* buf = result.data(); PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false); *(buf++) = '-'; PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false); @@ -817,15 +817,15 @@ std::string Env::GenerateUniqueId() { return result; } -SequentialFile::~SequentialFile() {} +SequentialFile::~SequentialFile() = default; -RandomAccessFile::~RandomAccessFile() {} +RandomAccessFile::~RandomAccessFile() = default; -WritableFile::~WritableFile() {} +WritableFile::~WritableFile() = default; -MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {} +MemoryMappedFileBuffer::~MemoryMappedFileBuffer() = default; -Logger::~Logger() {} +Logger::~Logger() = default; Status Logger::Close() { if (!closed_) { @@ -838,7 +838,7 @@ Status Logger::Close() { Status Logger::CloseImpl() { return Status::NotSupported(); } -FileLock::~FileLock() {} +FileLock::~FileLock() = default; void LogFlush(Logger* info_log) { if (info_log) { @@ -1051,9 +1051,10 @@ void Log(const std::shared_ptr& info_log, const char* format, ...) { } Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, - bool should_sync) { + bool should_sync, const IOOptions* io_options) { const auto& fs = env->GetFileSystem(); - return WriteStringToFile(fs.get(), data, fname, should_sync); + return WriteStringToFile(fs.get(), data, fname, should_sync, + io_options ? *io_options : IOOptions()); } Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 93bb2dba0eb..6a3b0390af2 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -342,7 +342,7 @@ TEST_P(EnvMoreTestWithParam, GetChildren) { ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); ASSERT_EQ(3U, children.size()); ASSERT_EQ(3U, childAttr.size()); - for (auto each : children) { + for (const auto& each : children) { env_->DeleteDir(test_dir_ + "/" + each).PermitUncheckedError(); } // necessary for default POSIX env diff --git a/env/env_chroot.cc b/env/env_chroot.cc index 5ff32a7e444..93dd7acd1fc 100644 --- a/env/env_chroot.cc +++ b/env/env_chroot.cc @@ -7,10 +7,11 @@ #include "env/env_chroot.h" -#include // errno -#include // realpath, free #include // geteuid +#include // errno +#include // realpath, free + #include "env/composite_env_wrapper.h" #include "env/fs_remap.h" #include "rocksdb/utilities/options_type.h" diff --git a/env/env_encryption.cc b/env/env_encryption.cc index 7b2a531c424..ad7583db29f 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -117,7 +117,7 @@ IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { return file_->GetUniqueId(id, max_size); -}; +} void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { file_->Hint(pattern); @@ -827,7 +827,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, AllocateScratch(scratch); // Encrypt individual blocks. - while (1) { + while (true) { char* block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { @@ -871,7 +871,7 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, AllocateScratch(scratch); // Decrypt individual blocks. - while (1) { + while (true) { char* block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { diff --git a/env/env_encryption_ctr.h b/env/env_encryption_ctr.h index b4342f7012d..21abbe231d8 100644 --- a/env/env_encryption_ctr.h +++ b/env/env_encryption_ctr.h @@ -24,8 +24,8 @@ class CTRCipherStream final : public BlockAccessCipherStream { public: CTRCipherStream(const std::shared_ptr& c, const char* iv, uint64_t initialCounter) - : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){}; - virtual ~CTRCipherStream(){}; + : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){} + virtual ~CTRCipherStream(){} size_t BlockSize() override { return cipher_->BlockSize(); } diff --git a/env/env_posix.cc b/env/env_posix.cc index ae2f9036028..8b24a7a2788 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -14,19 +14,21 @@ #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION #include #endif -#include #include +#include + #if defined(ROCKSDB_IOURING_PRESENT) #include #endif #include -#include -#include -#include -#include #include #include + +#include +#include +#include +#include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include #endif @@ -36,10 +38,10 @@ #if defined(ROCKSDB_IOURING_PRESENT) #include #endif -#include #include #include +#include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) #elif defined(__MACH__) @@ -84,9 +86,9 @@ namespace ROCKSDB_NAMESPACE { #if defined(OS_WIN) static const std::string kSharedLibExt = ".dll"; -static const char kPathSeparator = ';'; +[[maybe_unused]] static const char kPathSeparator = ';'; #else -static const char kPathSeparator = ':'; +[[maybe_unused]] static const char kPathSeparator = ':'; #if defined(OS_MACOSX) static const std::string kSharedLibExt = ".dylib"; #else @@ -199,7 +201,7 @@ class PosixClock : public SystemClock { std::string dummy; dummy.reserve(maxsize); dummy.resize(maxsize); - char* p = &dummy[0]; + char* p = dummy.data(); port::LocalTimeR(&seconds, &t); snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); @@ -463,7 +465,7 @@ struct StartThreadState { }; static void* StartThreadWrapper(void* arg) { - StartThreadState* state = reinterpret_cast(arg); + StartThreadState* state = static_cast(arg); state->user_function(state->arg); delete state; return nullptr; diff --git a/env/env_test.cc b/env/env_test.cc index 1bd176fb0b0..ecce0d29539 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -26,13 +26,14 @@ #ifdef OS_LINUX #include #include -#include #include #include + +#include #endif #ifdef ROCKSDB_FALLOCATE_PRESENT -#include +#include #endif #include "db/db_impl/db_impl.h" @@ -238,13 +239,11 @@ TEST_F(EnvPosixTest, LowerThreadPoolCpuPriority) { std::atomic from_priority(CpuPriority::kNormal); std::atomic to_priority(CpuPriority::kNormal); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", [&](void* pri) { - from_priority.store(*reinterpret_cast(pri)); - }); + "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", + [&](void* pri) { from_priority.store(*static_cast(pri)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "ThreadPoolImpl::BGThread::AfterSetCpuPriority", [&](void* pri) { - to_priority.store(*reinterpret_cast(pri)); - }); + "ThreadPoolImpl::BGThread::AfterSetCpuPriority", + [&](void* pri) { to_priority.store(*static_cast(pri)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); env_->SetBackgroundThreads(1, Env::BOTTOM); @@ -446,7 +445,7 @@ TEST_P(EnvPosixTestWithParam, RunMany) { CB(std::atomic* p, int i) : last_id_ptr(p), id(i) {} static void Run(void* v) { - CB* cb = reinterpret_cast(v); + CB* cb = static_cast(v); int cur = cb->last_id_ptr->load(); ASSERT_EQ(cb->id - 1, cur); cb->last_id_ptr->store(cb->id); @@ -483,7 +482,7 @@ struct State { }; static void ThreadBody(void* arg) { - State* s = reinterpret_cast(arg); + State* s = static_cast(arg); s->mu.Lock(); s->val += 1; s->num_running -= 1; @@ -530,7 +529,7 @@ TEST_P(EnvPosixTestWithParam, TwoPools) { should_start_(_should_start) {} static void Run(void* v) { - CB* cb = reinterpret_cast(v); + CB* cb = static_cast(v); cb->Run(); } @@ -2609,7 +2608,7 @@ TEST_F(EnvTest, IsDirectory) { FileOptions(), SystemClock::Default().get())); constexpr char buf[] = "test"; - s = fwriter->Append(buf); + s = fwriter->Append(IOOptions(), buf); ASSERT_OK(s); } ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir)); @@ -2955,7 +2954,7 @@ struct NoDuplicateMiniStressTest { NoDuplicateMiniStressTest() { env = Env::Default(); } - virtual ~NoDuplicateMiniStressTest() {} + virtual ~NoDuplicateMiniStressTest() = default; void Run() { std::array threads; @@ -3448,7 +3447,7 @@ TEST_F(CreateEnvTest, CreateCompositeEnv) { class ReadAsyncFS; struct MockIOHandle { - std::function cb; + std::function cb; void* cb_arg; bool create_io_error; }; @@ -3463,7 +3462,7 @@ class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper { : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; @@ -3515,7 +3514,7 @@ class ReadAsyncFS : public FileSystemWrapper { IOStatus ReadAsyncRandomAccessFile::ReadAsync( FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) { IOHandleDeleter deletefn = [](void* args) -> void { delete (static_cast(args)); @@ -3603,8 +3602,8 @@ TEST_F(TestAsyncRead, ReadAsync) { } // callback function passed to async read. - std::function callback = - [&](const FSReadRequest& req, void* cb_arg) { + std::function callback = + [&](FSReadRequest& req, void* cb_arg) { assert(cb_arg != nullptr); size_t i = *(reinterpret_cast(cb_arg)); reqs[i].offset = req.offset; diff --git a/env/file_system.cc b/env/file_system.cc index 71fb4d5bc74..27c7207f0f5 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -22,9 +22,9 @@ namespace ROCKSDB_NAMESPACE { -FileSystem::FileSystem() {} +FileSystem::FileSystem() = default; -FileSystem::~FileSystem() {} +FileSystem::~FileSystem() = default; static int RegisterBuiltinFileSystems(ObjectLibrary& library, const std::string& /*arg*/) { @@ -180,19 +180,20 @@ FileOptions FileSystem::OptimizeForBlobFileRead( } IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, - const std::string& fname, bool should_sync) { + const std::string& fname, bool should_sync, + const IOOptions& io_options) { std::unique_ptr file; EnvOptions soptions; IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); if (!s.ok()) { return s; } - s = file->Append(data, IOOptions(), nullptr); + s = file->Append(data, io_options, nullptr); if (s.ok() && should_sync) { - s = file->Sync(IOOptions(), nullptr); + s = file->Sync(io_options, nullptr); } if (!s.ok()) { - fs->DeleteFile(fname, IOOptions(), nullptr); + fs->DeleteFile(fname, io_options, nullptr); } return s; } diff --git a/env/file_system_tracer.cc b/env/file_system_tracer.cc index d0c45c57eee..dc44107b58c 100644 --- a/env/file_system_tracer.cc +++ b/env/file_system_tracer.cc @@ -340,7 +340,7 @@ IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset, IOStatus FSRandomAccessFileTracingWrapper::ReadAsync( FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) { // Create a callback and populate info. auto read_async_callback = @@ -361,8 +361,8 @@ IOStatus FSRandomAccessFileTracingWrapper::ReadAsync( return s; } -void FSRandomAccessFileTracingWrapper::ReadAsyncCallback( - const FSReadRequest& req, void* cb_arg) { +void FSRandomAccessFileTracingWrapper::ReadAsyncCallback(FSReadRequest& req, + void* cb_arg) { ReadAsyncCallbackInfo* read_async_cb_info = static_cast(cb_arg); assert(read_async_cb_info); diff --git a/env/file_system_tracer.h b/env/file_system_tracer.h index 979a0bf1203..7502496dc16 100644 --- a/env/file_system_tracer.h +++ b/env/file_system_tracer.h @@ -229,11 +229,11 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper { IOStatus InvalidateCache(size_t offset, size_t length) override; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; - void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg); + void ReadAsyncCallback(FSReadRequest& req, void* cb_arg); private: std::shared_ptr io_tracer_; @@ -243,7 +243,7 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper { struct ReadAsyncCallbackInfo { uint64_t start_time_; - std::function cb_; + std::function cb_; void* cb_arg_; std::string file_op_; }; diff --git a/env/fs_posix.cc b/env/fs_posix.cc index dd2f749350d..6d95d9a2eea 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -13,16 +13,17 @@ #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION #include #endif -#include #include #include -#include -#include -#include -#include #include #include #include + +#include +#include +#include +#include +#include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include #include @@ -30,9 +31,9 @@ #include #include #include -#include #include +#include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) #elif defined(__MACH__) @@ -115,7 +116,7 @@ class PosixFileLock : public FileLock { filename.clear(); } - virtual ~PosixFileLock() override { + ~PosixFileLock() override { // Check for destruction without UnlockFile assert(fd_ == -1); } @@ -143,7 +144,7 @@ class PosixFileSystem : public FileSystem { const char* Name() const override { return kClassName(); } const char* NickName() const override { return kDefaultName(); } - ~PosixFileSystem() override {} + ~PosixFileSystem() override = default; bool IsInstanceOf(const std::string& name) const override { if (name == "posix") { return true; @@ -805,7 +806,7 @@ class PosixFileSystem : public FileSystem { IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - PosixFileLock* my_lock = reinterpret_cast(lock); + PosixFileLock* my_lock = static_cast(lock); IOStatus result; mutex_locked_files.Lock(); // If we are unlocking, then verify that we had locked it earlier, @@ -997,16 +998,14 @@ class PosixFileSystem : public FileSystem { } #endif // ROCKSDB_IOURING_PRESENT - // EXPERIMENTAL - // - // TODO akankshamahajan: + // TODO: // 1. Update Poll API to take into account min_completions // and returns if number of handles in io_handles (any order) completed is // equal to atleast min_completions. // 2. Currently in case of direct_io, Read API is called because of which call // to Poll API fails as it expects IOHandle to be populated. - virtual IOStatus Poll(std::vector& io_handles, - size_t /*min_completions*/) override { + IOStatus Poll(std::vector& io_handles, + size_t /*min_completions*/) override { #if defined(ROCKSDB_IOURING_PRESENT) // io_uring_queue_init. struct io_uring* iu = nullptr; @@ -1078,7 +1077,7 @@ class PosixFileSystem : public FileSystem { #endif } - virtual IOStatus AbortIO(std::vector& io_handles) override { + IOStatus AbortIO(std::vector& io_handles) override { #if defined(ROCKSDB_IOURING_PRESENT) // io_uring_queue_init. struct io_uring* iu = nullptr; diff --git a/env/io_posix.cc b/env/io_posix.cc index 0ec0e9c83b4..29efb055bae 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -10,23 +10,24 @@ #ifdef ROCKSDB_LIB_IO_POSIX #include "env/io_posix.h" -#include #include #include +#include #if defined(OS_LINUX) #include #ifndef FALLOC_FL_KEEP_SIZE #include #endif #endif -#include -#include -#include #include #include #include #include + +#include +#include +#include #ifdef OS_LINUX #include #include @@ -437,7 +438,7 @@ void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize( size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, int fd) { - std::string dir = fname.substr(0, fname.find_last_of("/")); + std::string dir = fname.substr(0, fname.find_last_of('/')); if (dir.empty()) { dir = "/"; } @@ -654,7 +655,9 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size(); // If requests exceed depth, split it into batches - if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth; + if (this_reqs > kIoUringDepth) { + this_reqs = kIoUringDepth; + } assert(incomplete_rq_list.size() <= this_reqs); for (size_t i = 0; i < this_reqs; i++) { @@ -854,7 +857,7 @@ IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { IOStatus PosixRandomAccessFile::ReadAsync( FSReadRequest& req, const IOOptions& /*opts*/, - std::function cb, void* cb_arg, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { if (use_direct_io()) { assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment())); @@ -965,7 +968,7 @@ IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, } else if (offset + n > length_) { n = static_cast(length_ - offset); } - *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + *result = Slice(static_cast(mmapped_region_) + offset, n); return s; } @@ -1064,7 +1067,7 @@ IOStatus PosixMmapFile::MapNewRegion() { } TEST_KILL_RANDOM("PosixMmapFile::Append:2"); - base_ = reinterpret_cast(ptr); + base_ = static_cast(ptr); limit_ = base_ + map_size_; dst_ = base_; last_sync_ = base_; diff --git a/env/io_posix.h b/env/io_posix.h index 8c51ba6450c..603af2f885a 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -76,8 +76,8 @@ inline bool IsSectorAligned(const void* ptr, size_t sector_size) { #if defined(ROCKSDB_IOURING_PRESENT) struct Posix_IOHandle { Posix_IOHandle(struct io_uring* _iu, - std::function _cb, - void* _cb_arg, uint64_t _offset, size_t _len, char* _scratch, + std::function _cb, void* _cb_arg, + uint64_t _offset, size_t _len, char* _scratch, bool _use_direct_io, size_t _alignment) : iu(_iu), cb(_cb), @@ -92,7 +92,7 @@ struct Posix_IOHandle { struct iovec iov; struct io_uring* iu; - std::function cb; + std::function cb; void* cb_arg; uint64_t offset; size_t len; @@ -246,15 +246,15 @@ class PosixSequentialFile : public FSSequentialFile { size_t logical_block_size, const EnvOptions& options); virtual ~PosixSequentialFile(); - virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result, - char* scratch, IODebugContext* dbg) override; - virtual IOStatus PositionedRead(uint64_t offset, size_t n, - const IOOptions& opts, Slice* result, - char* scratch, IODebugContext* dbg) override; - virtual IOStatus Skip(uint64_t n) override; - virtual IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { return use_direct_io_; } - virtual size_t GetRequiredBufferAlignment() const override { + IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch, + IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) override; + IOStatus Skip(uint64_t n) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; + bool use_direct_io() const override { return use_direct_io_; } + size_t GetRequiredBufferAlignment() const override { return logical_sector_size_; } }; @@ -299,31 +299,30 @@ class PosixRandomAccessFile : public FSRandomAccessFile { ); virtual ~PosixRandomAccessFile(); - virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, - Slice* result, char* scratch, - IODebugContext* dbg) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result, + char* scratch, IODebugContext* dbg) const override; - virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, - const IOOptions& options, - IODebugContext* dbg) override; + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; - virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts, - IODebugContext* dbg) override; + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts, + IODebugContext* dbg) override; #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) - virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; #endif - virtual void Hint(AccessPattern pattern) override; - virtual IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { return use_direct_io_; } - virtual size_t GetRequiredBufferAlignment() const override { + void Hint(AccessPattern pattern) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; + bool use_direct_io() const override { return use_direct_io_; } + size_t GetRequiredBufferAlignment() const override { return logical_sector_size_; } - // EXPERIMENTAL - virtual IOStatus ReadAsync( - FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, - void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; + + virtual IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function cb, + void* cb_arg, void** io_handle, + IOHandleDeleter* del_fn, + IODebugContext* dbg) override; }; class PosixWritableFile : public FSWritableFile { @@ -351,47 +350,44 @@ class PosixWritableFile : public FSWritableFile { // Need to implement this so the file is truncated correctly // with direct I/O - virtual IOStatus Truncate(uint64_t size, const IOOptions& opts, - IODebugContext* dbg) override; - virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Append(const Slice& data, const IOOptions& opts, - IODebugContext* dbg) override; - virtual IOStatus Append(const Slice& data, const IOOptions& opts, - const DataVerificationInfo& /* verification_info */, - IODebugContext* dbg) override { + IOStatus Truncate(uint64_t size, const IOOptions& opts, + IODebugContext* dbg) override; + IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { return Append(data, opts, dbg); } - virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& opts, - IODebugContext* dbg) override; - virtual IOStatus PositionedAppend( - const Slice& data, uint64_t offset, const IOOptions& opts, - const DataVerificationInfo& /* verification_info */, - IODebugContext* dbg) override { + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { return PositionedAppend(data, offset, opts, dbg); } - virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; - virtual bool IsSyncThreadSafe() const override; - virtual bool use_direct_io() const override { return use_direct_io_; } - virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override; - virtual uint64_t GetFileSize(const IOOptions& opts, - IODebugContext* dbg) override; - virtual IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual size_t GetRequiredBufferAlignment() const override { + IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + bool IsSyncThreadSafe() const override; + bool use_direct_io() const override { return use_direct_io_; } + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override; + uint64_t GetFileSize(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; + size_t GetRequiredBufferAlignment() const override { return logical_sector_size_; } #ifdef ROCKSDB_FALLOCATE_PRESENT - virtual IOStatus Allocate(uint64_t offset, uint64_t len, - const IOOptions& opts, - IODebugContext* dbg) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& opts, + IODebugContext* dbg) override; #endif - virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes, - const IOOptions& opts, - IODebugContext* dbg) override; + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& opts, + IODebugContext* dbg) override; #ifdef OS_LINUX - virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; #endif }; @@ -449,28 +445,26 @@ class PosixMmapFile : public FSWritableFile { // Means Close() will properly take care of truncate // and it does not need any additional information - virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/, - IODebugContext* /*dbg*/) override { + IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { return IOStatus::OK(); } - virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Append(const Slice& data, const IOOptions& opts, - IODebugContext* dbg) override; - virtual IOStatus Append(const Slice& data, const IOOptions& opts, - const DataVerificationInfo& /* verification_info */, - IODebugContext* dbg) override { + IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { return Append(data, opts, dbg); } - virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; - virtual uint64_t GetFileSize(const IOOptions& opts, - IODebugContext* dbg) override; - virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + uint64_t GetFileSize(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; #ifdef ROCKSDB_FALLOCATE_PRESENT - virtual IOStatus Allocate(uint64_t offset, uint64_t len, - const IOOptions& opts, - IODebugContext* dbg) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& opts, + IODebugContext* dbg) override; #endif }; @@ -480,17 +474,16 @@ class PosixRandomRWFile : public FSRandomRWFile { const EnvOptions& options); virtual ~PosixRandomRWFile(); - virtual IOStatus Write(uint64_t offset, const Slice& data, - const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& opts, + IODebugContext* dbg) override; - virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, - Slice* result, char* scratch, - IODebugContext* dbg) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result, + char* scratch, IODebugContext* dbg) const override; - virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; private: const std::string filename_; @@ -507,11 +500,11 @@ class PosixDirectory : public FSDirectory { public: explicit PosixDirectory(int fd, const std::string& directory_name); ~PosixDirectory(); - virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; - virtual IOStatus FsyncWithDirOptions( + IOStatus FsyncWithDirOptions( const IOOptions&, IODebugContext*, const DirFsyncOptions& dir_fsync_options) override; diff --git a/env/mock_env.cc b/env/mock_env.cc index c232af61eb5..e206593a2a0 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -483,7 +483,7 @@ class TestMemLogger : public Logger { options_(options), dbg_(dbg), flush_pending_(false) {} - ~TestMemLogger() override {} + ~TestMemLogger() override = default; void Flush() override { if (flush_pending_) { diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index be174bd73d2..23c4baa1276 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -37,28 +37,28 @@ TEST_F(MockEnvTest, Corrupt) { Slice result; std::unique_ptr rand_file; ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_)); - ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, scratch.data())); ASSERT_EQ(result.compare(kGood), 0); // Sync + corrupt => no change ASSERT_OK(writable_file->Fsync()); ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); result.clear(); - ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, scratch.data())); ASSERT_EQ(result.compare(kGood), 0); // Add new data and corrupt it ASSERT_OK(writable_file->Append(kCorrupted)); ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size()); result.clear(); - ASSERT_OK( - rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0]))); + ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), &result, + scratch.data())); ASSERT_EQ(result.compare(kCorrupted), 0); // Corrupted ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); result.clear(); - ASSERT_OK( - rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0]))); + ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), &result, + scratch.data())); ASSERT_NE(result.compare(kCorrupted), 0); } diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index 544adf8ae6a..52b054002d7 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -117,8 +117,7 @@ class FullCompactor : public Compactor { } static void CompactFiles(void* arg) { - std::unique_ptr task( - reinterpret_cast(arg)); + std::unique_ptr task(static_cast(arg)); assert(task); assert(task->db); Status s = task->db->CompactFiles( diff --git a/examples/compaction_filter_example.cc b/examples/compaction_filter_example.cc index ed1ada823f0..03a1952600d 100644 --- a/examples/compaction_filter_example.cc +++ b/examples/compaction_filter_example.cc @@ -10,8 +10,8 @@ class MyMerge : public ROCKSDB_NAMESPACE::MergeOperator { public: - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { merge_out->new_value.clear(); if (merge_in.existing_value != nullptr) { merge_out->new_value.assign(merge_in.existing_value->data(), diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc index 93c54d75569..b9a6cbe207d 100644 --- a/examples/multi_processes_example.cc +++ b/examples/multi_processes_example.cc @@ -64,7 +64,7 @@ const std::vector& GetColumnFamilyNames() { inline bool IsLittleEndian() { uint32_t x = 1; - return *reinterpret_cast(&x) != 0; + return *static_cast(&x) != 0; } static std::atomic& ShouldSecondaryWait() { @@ -75,7 +75,7 @@ static std::atomic& ShouldSecondaryWait() { static std::string Key(uint64_t k) { std::string ret; if (IsLittleEndian()) { - ret.append(reinterpret_cast(&k), sizeof(k)); + ret.append(static_cast(&k), sizeof(k)); } else { char buf[sizeof(k)]; buf[0] = k & 0xff; diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 78ea6f7feeb..1adbf3846ea 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -177,7 +177,7 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm, Status DeleteScheduler::MarkAsTrash(const std::string& file_path, std::string* trash_file) { // Sanity check of the path - size_t idx = file_path.rfind("/"); + size_t idx = file_path.rfind('/'); if (idx == std::string::npos || idx == file_path.size() - 1) { return Status::InvalidArgument("file_path is corrupted"); } @@ -367,7 +367,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); TEST_SYNC_POINT_CALLBACK( "DeleteScheduler::DeleteTrashFile::AfterSyncDir", - reinterpret_cast(const_cast(&dir_to_sync))); + static_cast(const_cast(&dir_to_sync))); } } if (s.ok()) { diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 46e834879a3..25d9f1acd8d 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -131,7 +131,7 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) { dir_synced++; - std::string* dir = reinterpret_cast(arg); + std::string* dir = static_cast(arg); EXPECT_EQ(dummy_files_dirs_[0], *dir); }); diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index da4a1d0b9dd..73050aed793 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -22,71 +22,71 @@ namespace ROCKSDB_NAMESPACE { -void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment, - uint64_t offset, - size_t roundup_len, - uint32_t index, bool refit_tail, - uint64_t& chunk_len) { - uint64_t chunk_offset_in_buffer = 0; +void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment, + uint64_t offset, + size_t roundup_len, + bool refit_tail, + uint64_t& aligned_useful_len) { + uint64_t aligned_useful_offset_in_buf = 0; bool copy_data_to_new_buffer = false; // Check if requested bytes are in the existing buffer_. // If only a few bytes exist -- reuse them & read only what is really needed. // This is typically the case of incremental reading of data. // If no bytes exist in buffer -- full pread. - if (DoesBufferContainData(index) && IsOffsetInBuffer(offset, index)) { + if (buf->DoesBufferContainData() && buf->IsOffsetInBuffer(offset)) { // Only a few requested bytes are in the buffer. memmove those chunk of // bytes to the beginning, and memcpy them back into the new buffer if a // new buffer is created. - chunk_offset_in_buffer = Rounddown( - static_cast(offset - bufs_[index].offset_), alignment); - chunk_len = static_cast(bufs_[index].buffer_.CurrentSize()) - - chunk_offset_in_buffer; - assert(chunk_offset_in_buffer % alignment == 0); - assert(chunk_len % alignment == 0); - assert(chunk_offset_in_buffer + chunk_len <= - bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); - if (chunk_len > 0) { + aligned_useful_offset_in_buf = + Rounddown(static_cast(offset - buf->offset_), alignment); + aligned_useful_len = static_cast(buf->CurrentSize()) - + aligned_useful_offset_in_buf; + assert(aligned_useful_offset_in_buf % alignment == 0); + assert(aligned_useful_len % alignment == 0); + assert(aligned_useful_offset_in_buf + aligned_useful_len <= + buf->offset_ + buf->CurrentSize()); + if (aligned_useful_len > 0) { copy_data_to_new_buffer = true; } else { // this reset is not necessary, but just to be safe. - chunk_offset_in_buffer = 0; + aligned_useful_offset_in_buf = 0; } } // Create a new buffer only if current capacity is not sufficient, and memcopy - // bytes from old buffer if needed (i.e., if chunk_len is greater than 0). - if (bufs_[index].buffer_.Capacity() < roundup_len) { - bufs_[index].buffer_.Alignment(alignment); - bufs_[index].buffer_.AllocateNewBuffer( + // bytes from old buffer if needed (i.e., if aligned_useful_len is greater + // than 0). + if (buf->buffer_.Capacity() < roundup_len) { + buf->buffer_.Alignment(alignment); + buf->buffer_.AllocateNewBuffer( static_cast(roundup_len), copy_data_to_new_buffer, - chunk_offset_in_buffer, static_cast(chunk_len)); - } else if (chunk_len > 0 && refit_tail) { + aligned_useful_offset_in_buf, static_cast(aligned_useful_len)); + } else if (aligned_useful_len > 0 && refit_tail) { // New buffer not needed. But memmove bytes from tail to the beginning since - // chunk_len is greater than 0. - bufs_[index].buffer_.RefitTail(static_cast(chunk_offset_in_buffer), - static_cast(chunk_len)); - } else if (chunk_len > 0) { - // For async prefetching, it doesn't call RefitTail with chunk_len > 0. - // Allocate new buffer if needed because aligned buffer calculate remaining - // buffer as capacity_ - cursize_ which might not be the case in this as we - // are not refitting. - // TODO akanksha: Update the condition when asynchronous prefetching is - // stable. - bufs_[index].buffer_.Alignment(alignment); - bufs_[index].buffer_.AllocateNewBuffer( + // aligned_useful_len is greater than 0. + buf->buffer_.RefitTail(static_cast(aligned_useful_offset_in_buf), + static_cast(aligned_useful_len)); + } else if (aligned_useful_len > 0) { + // For async prefetching, it doesn't call RefitTail with aligned_useful_len + // > 0. Allocate new buffer if needed because aligned buffer calculate + // remaining buffer as capacity - cursize which might not be the case in + // this as it's not refitting. + // TODO: Use refit_tail for async prefetching too. + buf->buffer_.Alignment(alignment); + buf->buffer_.AllocateNewBuffer( static_cast(roundup_len), copy_data_to_new_buffer, - chunk_offset_in_buffer, static_cast(chunk_len)); + aligned_useful_offset_in_buf, static_cast(aligned_useful_len)); } } -Status FilePrefetchBuffer::Read(const IOOptions& opts, +Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts, RandomAccessFileReader* reader, - uint64_t read_len, uint64_t chunk_len, - uint64_t rounddown_start, uint32_t index) { + uint64_t read_len, uint64_t aligned_useful_len, + uint64_t start_offset) { Slice result; - Status s = reader->Read(opts, rounddown_start + chunk_len, read_len, &result, - bufs_[index].buffer_.BufferStart() + chunk_len, - /*aligned_buf=*/nullptr); + char* to_buf = buf->buffer_.BufferStart() + aligned_useful_len; + Status s = reader->Read(opts, start_offset + aligned_useful_len, read_len, + &result, to_buf, /*aligned_buf=*/nullptr); #ifndef NDEBUG if (result.size() < read_len) { // Fake an IO error to force db_stress fault injection to ignore @@ -97,20 +97,25 @@ Status FilePrefetchBuffer::Read(const IOOptions& opts, if (!s.ok()) { return s; } + if (result.data() != to_buf) { + // If the read is coming from some other buffer already in memory (such as + // mmap) then it would be inefficient to create another copy in this + // FilePrefetchBuffer. The caller is expected to exclude this case. + assert(false); + return Status::Corruption("File read didn't populate our buffer"); + } if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) { RecordTick(stats_, PREFETCH_BYTES, read_len); } - // Update the buffer offset and size. - bufs_[index].offset_ = rounddown_start; - bufs_[index].buffer_.Size(static_cast(chunk_len) + result.size()); + // Update the buffer size. + buf->buffer_.Size(static_cast(aligned_useful_len) + result.size()); return s; } -Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts, +Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts, RandomAccessFileReader* reader, - uint64_t read_len, - uint64_t rounddown_start, uint32_t index) { + uint64_t read_len, uint64_t start_offset) { TEST_SYNC_POINT("FilePrefetchBuffer::ReadAsync"); // callback for async read request. auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this, @@ -118,18 +123,17 @@ Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts, FSReadRequest req; Slice result; req.len = read_len; - req.offset = rounddown_start; + req.offset = start_offset; req.result = result; - req.scratch = bufs_[index].buffer_.BufferStart(); - bufs_[index].async_req_len_ = req.len; + req.scratch = buf->buffer_.BufferStart(); + buf->async_req_len_ = req.len; - Status s = - reader->ReadAsync(req, opts, fp, &(bufs_[index].pos_), - &(bufs_[index].io_handle_), &(bufs_[index].del_fn_), - /*aligned_buf=*/nullptr); + Status s = reader->ReadAsync(req, opts, fp, buf, &(buf->io_handle_), + &(buf->del_fn_), /*aligned_buf =*/nullptr); req.status.PermitUncheckedError(); if (s.ok()) { - bufs_[index].async_read_in_progress_ = true; + RecordTick(stats_, PREFETCH_BYTES, read_len); + buf->async_read_in_progress_ = true; } return s; } @@ -140,28 +144,32 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, if (!enable_ || reader == nullptr) { return Status::OK(); } + + assert(num_buffers_ == 1); + + AllocateBufferIfEmpty(); + BufferInfo* buf = GetFirstBuffer(); + TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start"); - if (offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) { - // All requested bytes are already in the curr_ buffer. So no need to Read - // again. + if (offset + n <= buf->offset_ + buf->CurrentSize()) { + // All requested bytes are already in the buffer. So no need to Read again. return Status::OK(); } size_t alignment = reader->file()->GetRequiredBufferAlignment(); - size_t offset_ = static_cast(offset); - uint64_t rounddown_offset = Rounddown(offset_, alignment); - uint64_t roundup_end = Roundup(offset_ + n, alignment); - uint64_t roundup_len = roundup_end - rounddown_offset; - assert(roundup_len >= alignment); - assert(roundup_len % alignment == 0); + uint64_t rounddown_offset = offset, roundup_end = 0, aligned_useful_len = 0; + size_t read_len = 0; - uint64_t chunk_len = 0; - CalculateOffsetAndLen(alignment, offset, roundup_len, curr_, - true /*refit_tail*/, chunk_len); - size_t read_len = static_cast(roundup_len - chunk_len); + ReadAheadSizeTuning(buf, /*read_curr_block=*/true, + /*refit_tail=*/true, rounddown_offset, alignment, 0, n, + rounddown_offset, roundup_end, read_len, + aligned_useful_len); - Status s = Read(opts, reader, read_len, chunk_len, rounddown_offset, curr_); + Status s; + if (read_len > 0) { + s = Read(buf, opts, reader, read_len, aligned_useful_len, rounddown_offset); + } if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) { RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len); @@ -169,25 +177,27 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, return s; } -// Copy data from src to third buffer. -void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset, +// Copy data from src to overlap_buf_. +void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset, size_t& length) { if (length == 0) { return; } - uint64_t copy_offset = (offset - bufs_[src].offset_); + + uint64_t copy_offset = (offset - src->offset_); size_t copy_len = 0; - if (IsDataBlockInBuffer(offset, length, src)) { + if (src->IsDataBlockInBuffer(offset, length)) { // All the bytes are in src. copy_len = length; } else { - copy_len = bufs_[src].buffer_.CurrentSize() - copy_offset; + copy_len = src->CurrentSize() - copy_offset; } - memcpy(bufs_[2].buffer_.BufferStart() + bufs_[2].buffer_.CurrentSize(), - bufs_[src].buffer_.BufferStart() + copy_offset, copy_len); + BufferInfo* dst = overlap_buf_; + memcpy(dst->buffer_.BufferStart() + dst->CurrentSize(), + src->buffer_.BufferStart() + copy_offset, copy_len); - bufs_[2].buffer_.Size(bufs_[2].buffer_.CurrentSize() + copy_len); + dst->buffer_.Size(dst->CurrentSize() + copy_len); // Update offset and length. offset += copy_len; @@ -196,51 +206,43 @@ void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset, // length > 0 indicates it has consumed all data from the src buffer and it // still needs to read more other buffer. if (length > 0) { - bufs_[src].buffer_.Clear(); + FreeFrontBuffer(); } } -// Clear the buffers if it contains outdated data. Outdated data can be -// because previous sequential reads were read from the cache instead of these -// buffer. In that case outdated IOs should be aborted. -void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) { - uint32_t second = curr_ ^ 1; +// Clear the buffers if it contains outdated data. Outdated data can be because +// previous sequential reads were read from the cache instead of these buffer. +// In that case outdated IOs should be aborted. +void FilePrefetchBuffer::AbortOutdatedIO(uint64_t offset) { std::vector handles; - autovector buf_pos; - if (IsBufferOutdatedWithAsyncProgress(offset, curr_)) { - handles.emplace_back(bufs_[curr_].io_handle_); - buf_pos.emplace_back(curr_); - } - if (IsBufferOutdatedWithAsyncProgress(offset, second)) { - handles.emplace_back(bufs_[second].io_handle_); - buf_pos.emplace_back(second); + std::vector tmp_buf; + for (auto& buf : bufs_) { + if (buf->IsBufferOutdatedWithAsyncProgress(offset)) { + handles.emplace_back(buf->io_handle_); + tmp_buf.emplace_back(buf); + } } + if (!handles.empty()) { StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); Status s = fs_->AbortIO(handles); assert(s.ok()); } - for (auto& pos : buf_pos) { - // Release io_handle. - DestroyAndClearIOHandle(pos); - } - - if (bufs_[second].io_handle_ == nullptr) { - bufs_[second].async_read_in_progress_ = false; - } - - if (bufs_[curr_].io_handle_ == nullptr) { - bufs_[curr_].async_read_in_progress_ = false; + for (auto& buf : tmp_buf) { + if (buf->async_read_in_progress_) { + DestroyAndClearIOHandle(buf); + buf->async_read_in_progress_ = false; + } + buf->ClearBuffer(); } } void FilePrefetchBuffer::AbortAllIOs() { - uint32_t second = curr_ ^ 1; std::vector handles; - for (uint32_t i = 0; i < 2; i++) { - if (bufs_[i].async_read_in_progress_ && bufs_[i].io_handle_ != nullptr) { - handles.emplace_back(bufs_[i].io_handle_); + for (auto& buf : bufs_) { + if (buf->async_read_in_progress_ && buf->io_handle_ != nullptr) { + handles.emplace_back(buf->io_handle_); } } if (!handles.empty()) { @@ -249,367 +251,421 @@ void FilePrefetchBuffer::AbortAllIOs() { assert(s.ok()); } - // Release io_handles. - if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) { - DestroyAndClearIOHandle(curr_); - } else { - bufs_[curr_].async_read_in_progress_ = false; - } - - if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) { - DestroyAndClearIOHandle(second); - } else { - bufs_[second].async_read_in_progress_ = false; + for (auto& buf : bufs_) { + if (buf->io_handle_ != nullptr && buf->del_fn_ != nullptr) { + DestroyAndClearIOHandle(buf); + } + buf->async_read_in_progress_ = false; } } -// Clear the buffers if it contains outdated data. Outdated data can be -// because previous sequential reads were read from the cache instead of these -// buffer. -void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) { - uint32_t second = curr_ ^ 1; - if (IsBufferOutdated(offset, curr_)) { - bufs_[curr_].buffer_.Clear(); - } - if (IsBufferOutdated(offset, second)) { - bufs_[second].buffer_.Clear(); - } - - { - // In case buffers do not align, reset second buffer. This can happen in - // case readahead_size is set. - if (!bufs_[second].async_read_in_progress_ && - !bufs_[curr_].async_read_in_progress_) { - if (DoesBufferContainData(curr_)) { - if (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() != - bufs_[second].offset_) { - bufs_[second].buffer_.Clear(); - } - } else { - if (!IsOffsetInBuffer(offset, second)) { - bufs_[second].buffer_.Clear(); - } - } +// Clear the buffers if it contains outdated data wrt offset. Outdated data can +// be because previous sequential reads were read from the cache instead of +// these buffer or there is IOError while filling the buffers. +// +// offset - the offset requested to be read. This API makes sure that the +// front/first buffer in bufs_ should contain this offset, otherwise, all +// buffers will be freed. +void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) { + while (!IsBufferQueueEmpty()) { + BufferInfo* buf = GetFirstBuffer(); + // Offset is greater than this buffer's end offset. + if (buf->IsBufferOutdated(offset)) { + FreeFrontBuffer(); + } else { + break; } } - // If data starts from second buffer, make it curr_. Second buffer can be - // either partial filled, full or async read is in progress. - if (bufs_[second].async_read_in_progress_) { - if (IsOffsetInBufferWithAsyncProgress(offset, second)) { - curr_ = curr_ ^ 1; + if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) { + return; + } + + BufferInfo* buf = GetFirstBuffer(); + + if (buf->async_read_in_progress_) { + FreeEmptyBuffers(); + return; + } + + // Below handles the case for Overlapping buffers (NumBuffersAllocated > 1). + bool abort_io = false; + + if (buf->DoesBufferContainData() && buf->IsOffsetInBuffer(offset)) { + BufferInfo* next_buf = bufs_[1]; + if (/* next buffer doesn't align with first buffer and requested data + overlaps with next buffer */ + ((buf->offset_ + buf->CurrentSize() != next_buf->offset_) && + (offset + length > buf->offset_ + buf->CurrentSize()))) { + abort_io = true; } } else { - if (DoesBufferContainData(second) && IsOffsetInBuffer(offset, second)) { - assert(bufs_[curr_].async_read_in_progress_ || - bufs_[curr_].buffer_.CurrentSize() == 0); - curr_ = curr_ ^ 1; + // buffer with offset doesn't contain data or offset doesn't lie in this + // buffer. + buf->ClearBuffer(); + abort_io = true; + } + + if (abort_io) { + AbortAllIOs(); + // Clear all buffers after first. + for (size_t i = 1; i < bufs_.size(); ++i) { + bufs_[i]->ClearBuffer(); } } + FreeEmptyBuffers(); + assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset)); } -void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) { - if (bufs_[curr_].async_read_in_progress_ && fs_ != nullptr) { - if (bufs_[curr_].io_handle_ != nullptr) { +void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) { + BufferInfo* buf = GetFirstBuffer(); + + if (buf->async_read_in_progress_ && fs_ != nullptr) { + if (buf->io_handle_ != nullptr) { // Wait for prefetch data to complete. // No mutex is needed as async_read_in_progress behaves as mutex and is // updated by main thread only. std::vector handles; - handles.emplace_back(bufs_[curr_].io_handle_); + handles.emplace_back(buf->io_handle_); StopWatch sw(clock_, stats_, POLL_WAIT_MICROS); fs_->Poll(handles, 1).PermitUncheckedError(); } // Reset and Release io_handle after the Poll API as request has been // completed. - DestroyAndClearIOHandle(curr_); + DestroyAndClearIOHandle(buf); + } + + // Always call outdated data after Poll as Buffers might be out of sync w.r.t + // offset and length. + ClearOutdatedData(offset, length); +} + +// ReadAheadSizeTuning API calls readaheadsize_cb_ +// (BlockBasedTableIterator::BlockCacheLookupForReadAheadSize) to lookup in the +// cache and tune the start and end offsets based on cache hits/misses. +// +// Arguments - +// read_curr_block : True if this call was due to miss in the cache and +// FilePrefetchBuffer wants to read that block +// synchronously. +// False if current call is to prefetch additional data in +// extra buffers through ReadAsync API. +// prev_buf_end_offset : End offset of the previous buffer. It's used in case +// of ReadAsync to make sure it doesn't read anything from +// previous buffer which is already prefetched. +void FilePrefetchBuffer::ReadAheadSizeTuning( + BufferInfo* buf, bool read_curr_block, bool refit_tail, + uint64_t prev_buf_end_offset, size_t alignment, size_t length, + size_t readahead_size, uint64_t& start_offset, uint64_t& end_offset, + size_t& read_len, uint64_t& aligned_useful_len) { + uint64_t updated_start_offset = Rounddown(start_offset, alignment); + uint64_t updated_end_offset = + Roundup(start_offset + length + readahead_size, alignment); + uint64_t initial_end_offset = updated_end_offset; + uint64_t initial_start_offset = updated_start_offset; + + // Callback to tune the start and end offsets. + if (readaheadsize_cb_ != nullptr && readahead_size > 0) { + readaheadsize_cb_(read_curr_block, updated_start_offset, + updated_end_offset); + } + + // read_len will be 0 and there is nothing to read/prefetch. + if (updated_start_offset == updated_end_offset) { + start_offset = end_offset = updated_start_offset; + UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset), + (updated_end_offset - updated_start_offset)); + return; + } + + assert(updated_start_offset < updated_end_offset); + + if (!read_curr_block) { + // Handle the case when callback added block handles which are already + // prefetched and nothing new needs to be prefetched. In that case end + // offset updated by callback will be less than prev_buf_end_offset which + // means data has been already prefetched. + if (updated_end_offset <= prev_buf_end_offset) { + start_offset = end_offset = prev_buf_end_offset; + UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset), + (end_offset - start_offset)); + return; + } + } + + // Realign if start and end offsets are not aligned after tuning. + start_offset = Rounddown(updated_start_offset, alignment); + end_offset = Roundup(updated_end_offset, alignment); + + if (!read_curr_block && start_offset < prev_buf_end_offset) { + // Previous buffer already contains the data till prev_buf_end_offset + // because of alignment. Update the start offset after that to avoid + // prefetching it again. + start_offset = prev_buf_end_offset; } - UpdateBuffersIfNeeded(offset); + + uint64_t roundup_len = end_offset - start_offset; + + PrepareBufferForRead(buf, alignment, start_offset, roundup_len, refit_tail, + aligned_useful_len); + assert(roundup_len >= aligned_useful_len); + + // Update the buffer offset. + buf->offset_ = start_offset; + // Update the initial end offset of this buffer which will be the starting + // offset of next prefetch. + buf->initial_end_offset_ = initial_end_offset; + read_len = static_cast(roundup_len - aligned_useful_len); + + UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset), + (end_offset - start_offset)); } +// If data is overlapping between two buffers then during this call: +// - data from first buffer is copied into overlapping buffer, +// - first is removed from bufs_ and freed so that it can be used for async +// prefetching of further data. Status FilePrefetchBuffer::HandleOverlappingData( const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t length, size_t readahead_size, bool& copy_to_third_buffer, + size_t length, size_t readahead_size, bool& copy_to_overlap_buffer, uint64_t& tmp_offset, size_t& tmp_length) { + // No Overlapping of data between 2 buffers. + if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) { + return Status::OK(); + } + Status s; size_t alignment = reader->file()->GetRequiredBufferAlignment(); - uint32_t second; + + BufferInfo* buf = GetFirstBuffer(); // Check if the first buffer has the required offset and the async read is // still in progress. This should only happen if a prefetch was initiated // by Seek, but the next access is at another offset. - if (bufs_[curr_].async_read_in_progress_ && - IsOffsetInBufferWithAsyncProgress(offset, curr_)) { - PollAndUpdateBuffersIfNeeded(offset); + if (buf->async_read_in_progress_ && + buf->IsOffsetInBufferWithAsyncProgress(offset)) { + PollIfNeeded(offset, length); + } + + if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) { + return Status::OK(); } - second = curr_ ^ 1; - // If data is overlapping over two buffers, copy the data from curr_ and - // call ReadAsync on curr_. - if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) && - IsOffsetInBuffer(offset, curr_) && - (/*Data extends over curr_ buffer and second buffer either has data or in + BufferInfo* next_buf = bufs_[1]; + + // If data is overlapping over two buffers, copy the data from front and + // call ReadAsync on freed buffer. + if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() && + buf->IsOffsetInBuffer(offset) && + (/*Data extends over two buffers and second buffer either has data or in process of population=*/ - (offset + length > bufs_[second].offset_) && - (bufs_[second].async_read_in_progress_ || - DoesBufferContainData(second)))) { - // Allocate new buffer to third buffer; - bufs_[2].buffer_.Clear(); - bufs_[2].buffer_.Alignment(alignment); - bufs_[2].buffer_.AllocateNewBuffer(length); - bufs_[2].offset_ = offset; - copy_to_third_buffer = true; - - CopyDataToBuffer(curr_, tmp_offset, tmp_length); - - // Call async prefetching on curr_ since data has been consumed in curr_ - // only if data lies within second buffer. - size_t second_size = bufs_[second].async_read_in_progress_ - ? bufs_[second].async_req_len_ - : bufs_[second].buffer_.CurrentSize(); - uint64_t rounddown_start = bufs_[second].offset_ + second_size; - // Second buffer might be out of bound if first buffer already prefetched - // that data. - if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size && - !IsOffsetOutOfBound(rounddown_start)) { - uint64_t roundup_end = - Roundup(rounddown_start + readahead_size, alignment); - uint64_t roundup_len = roundup_end - rounddown_start; - uint64_t chunk_len = 0; - CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, curr_, - false, chunk_len); - assert(chunk_len == 0); - assert(roundup_len >= chunk_len); - - bufs_[curr_].offset_ = rounddown_start; - uint64_t read_len = static_cast(roundup_len - chunk_len); - s = ReadAsync(opts, reader, read_len, rounddown_start, curr_); - if (!s.ok()) { - DestroyAndClearIOHandle(curr_); - bufs_[curr_].buffer_.Clear(); - return s; + (offset + length > next_buf->offset_) && + (next_buf->async_read_in_progress_ || + next_buf->DoesBufferContainData()))) { + // Allocate new buffer to overlap_buf_. + overlap_buf_->ClearBuffer(); + overlap_buf_->buffer_.Alignment(alignment); + overlap_buf_->buffer_.AllocateNewBuffer(length); + overlap_buf_->offset_ = offset; + copy_to_overlap_buffer = true; + + CopyDataToBuffer(buf, tmp_offset, tmp_length); + UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize()); + + // Call async prefetching on freed buffer since data has been consumed + // only if requested data lies within next buffer. + size_t second_size = next_buf->async_read_in_progress_ + ? next_buf->async_req_len_ + : next_buf->CurrentSize(); + uint64_t start_offset = next_buf->initial_end_offset_; + + // If requested bytes - tmp_offset + tmp_length are in next buffer, freed + // buffer can go for further prefetching. + // If requested bytes are not in next buffer, next buffer has to go for sync + // call to get remaining requested bytes. In that case it shouldn't go for + // async prefetching as async prefetching calculates offset based on + // previous buffer end offset and previous buffer has to go for sync + // prefetching. + + if (tmp_offset + tmp_length <= next_buf->offset_ + second_size) { + AllocateBuffer(); + BufferInfo* new_buf = GetLastBuffer(); + size_t read_len = 0; + uint64_t end_offset = start_offset, aligned_useful_len = 0; + + ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false, + /*refit_tail=*/false, next_buf->offset_ + second_size, + alignment, + /*length=*/0, readahead_size, start_offset, + end_offset, read_len, aligned_useful_len); + if (read_len > 0) { + s = ReadAsync(new_buf, opts, reader, read_len, start_offset); + if (!s.ok()) { + DestroyAndClearIOHandle(new_buf); + FreeLastBuffer(); + return s; + } } } - curr_ = curr_ ^ 1; } return s; } -// If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is -// called. When buffers are switched, we clear the curr_ buffer as we assume the + +// When data is outdated, we clear the first buffer and free it as the // data has been consumed because of sequential reads. -// Data in buffers will always be sequential with curr_ following second and -// not vice versa. // // Scenarios for prefetching asynchronously: -// Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes -// synchronously in curr_ and prefetch readahead_size_/2 async in second -// buffer. -// Case2: If second buffer has partial or full data, make it current and -// prefetch readahead_size_/2 async in second buffer. In case of -// partial data, prefetch remaining bytes from size n synchronously to -// fulfill the requested bytes request. -// Case3: If curr_ has partial data, prefetch remaining bytes from size n -// synchronously in curr_ to fulfill the requested bytes request and -// prefetch readahead_size_/2 bytes async in second buffer. -// Case4: (Special case) If data is in both buffers, copy requested data from -// curr_, send async request on curr_, wait for poll to fill second -// buffer (if any), and copy remaining data from second buffer to third -// buffer. -Status FilePrefetchBuffer::PrefetchAsyncInternal(const IOOptions& opts, - RandomAccessFileReader* reader, - uint64_t offset, size_t length, - size_t readahead_size, - bool& copy_to_third_buffer) { +// Case1: If all buffers are in free_bufs_, prefetch n + readahead_size_/2 bytes +// synchronously in first buffer and prefetch readahead_size_/2 async in +// remaining buffers (num_buffers_ -1 ). +// Case2: If first buffer has partial data, prefetch readahead_size_/2 async in +// remaining buffers. In case of partial data, prefetch remaining bytes +// from size n synchronously to fulfill the requested bytes request. +// Case5: (Special case) If data is overlapping in two buffers, copy requested +// data from first, free that buffer to send for async request, wait for +// poll to fill next buffer (if any), and copy remaining data from that +// buffer to overlap buffer. +Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t length, + size_t readahead_size, + bool& copy_to_overlap_buffer) { if (!enable_) { return Status::OK(); } - TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start"); + TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start"); size_t alignment = reader->file()->GetRequiredBufferAlignment(); Status s; uint64_t tmp_offset = offset; size_t tmp_length = length; + size_t original_length = length; - // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with - // data. + // Abort outdated IO. if (!explicit_prefetch_submitted_) { - AbortIOIfNeeded(offset); + AbortOutdatedIO(offset); + FreeEmptyBuffers(); } - UpdateBuffersIfNeeded(offset); + ClearOutdatedData(offset, length); - // 2. Handle overlapping data over two buffers. If data is overlapping then - // during this call: - // - data from curr_ is copied into third buffer, - // - curr_ is send for async prefetching of further data if second buffer - // contains remaining requested data or in progress for async prefetch, - // - switch buffers and curr_ now points to second buffer to copy remaining - // data. + // Handle overlapping data over two buffers. s = HandleOverlappingData(opts, reader, offset, length, readahead_size, - copy_to_third_buffer, tmp_offset, tmp_length); + copy_to_overlap_buffer, tmp_offset, tmp_length); if (!s.ok()) { return s; } - // 3. Call Poll only if data is needed for the second buffer. - // - Return if whole data is in curr_ and second buffer is in progress or + AllocateBufferIfEmpty(); + BufferInfo* buf = GetFirstBuffer(); + + // Call Poll only if data is needed for the second buffer. + // - Return if whole data is in first and second buffer is in progress or // already full. // - If second buffer is empty, it will go for ReadAsync for second buffer. - if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) && - IsDataBlockInBuffer(offset, length, curr_)) { - // Whole data is in curr_. - UpdateBuffersIfNeeded(offset); - if (!IsSecondBuffEligibleForPrefetching()) { + if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() && + buf->IsDataBlockInBuffer(offset, length)) { + // Whole data is in buffer. + if (!IsEligibleForFurtherPrefetching()) { + UpdateStats(/*found_in_buffer=*/true, original_length); return s; } } else { - // After poll request, curr_ might be empty because of IOError in - // callback while reading or may contain required data. - PollAndUpdateBuffersIfNeeded(offset); - } - - if (copy_to_third_buffer) { - offset = tmp_offset; - length = tmp_length; - } - - // 4. After polling and swapping buffers, if all the requested bytes are in - // curr_, it will only go for async prefetching. - // copy_to_third_buffer is a special case so it will be handled separately. - if (!copy_to_third_buffer && DoesBufferContainData(curr_) && - IsDataBlockInBuffer(offset, length, curr_)) { - offset += length; - length = 0; - - // Since async request was submitted directly by calling PrefetchAsync in - // last call, we don't need to prefetch further as this call is to poll - // the data submitted in previous call. - if (explicit_prefetch_submitted_) { - return s; - } - if (!IsSecondBuffEligibleForPrefetching()) { - return s; - } - } - - uint32_t second = curr_ ^ 1; - assert(!bufs_[curr_].async_read_in_progress_); - - // In case because of some IOError curr_ got empty, abort IO for second as - // well. Otherwise data might not align if more data needs to be read in curr_ - // which might overlap with second buffer. - if (!DoesBufferContainData(curr_) && bufs_[second].async_read_in_progress_) { - if (bufs_[second].io_handle_ != nullptr) { - std::vector handles; - handles.emplace_back(bufs_[second].io_handle_); - { - StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); - Status status = fs_->AbortIO(handles); - assert(status.ok()); + PollIfNeeded(tmp_offset, tmp_length); + } + + AllocateBufferIfEmpty(); + buf = GetFirstBuffer(); + offset = tmp_offset; + length = tmp_length; + + // After polling, if all the requested bytes are in first buffer, it will only + // go for async prefetching. + if (buf->DoesBufferContainData()) { + if (copy_to_overlap_buffer) { + // Data is overlapping i.e. some of the data has been copied to overlap + // buffer and remaining will be updated below. + size_t initial_buf_size = overlap_buf_->CurrentSize(); + CopyDataToBuffer(buf, offset, length); + UpdateStats( + /*found_in_buffer=*/false, + overlap_buf_->CurrentSize() - initial_buf_size); + + // Length == 0: All the requested data has been copied to overlap buffer + // and it has already gone for async prefetching. It can return without + // doing anything further. + // Length > 0: More data needs to be consumed so it will continue async + // and sync prefetching and copy the remaining data to overlap buffer in + // the end. + if (length == 0) { + UpdateStats(/*found_in_buffer=*/true, length); + return s; + } + } else { + if (buf->IsDataBlockInBuffer(offset, length)) { + offset += length; + length = 0; + // Since async request was submitted directly by calling PrefetchAsync + // in last call, we don't need to prefetch further as this call is to + // poll the data submitted in previous call. + if (explicit_prefetch_submitted_) { + return s; + } + if (!IsEligibleForFurtherPrefetching()) { + UpdateStats(/*found_in_buffer=*/true, original_length); + return s; + } } - } - DestroyAndClearIOHandle(second); - bufs_[second].buffer_.Clear(); - } - - // 5. Data is overlapping i.e. some of the data has been copied to third - // buffer and remaining will be updated below. - if (copy_to_third_buffer && DoesBufferContainData(curr_)) { - CopyDataToBuffer(curr_, offset, length); - - // Length == 0: All the requested data has been copied to third buffer and - // it has already gone for async prefetching. It can return without doing - // anything further. - // Length > 0: More data needs to be consumed so it will continue async - // and sync prefetching and copy the remaining data to third buffer in the - // end. - if (length == 0) { - return s; } } - // 6. Go for ReadAsync and Read (if needed). - size_t prefetch_size = length + readahead_size; - size_t _offset = static_cast(offset); + AllocateBufferIfEmpty(); + buf = GetFirstBuffer(); - // offset and size alignment for curr_ buffer with synchronous prefetching - uint64_t rounddown_start1 = Rounddown(_offset, alignment); - uint64_t roundup_end1 = Roundup(_offset + prefetch_size, alignment); - uint64_t roundup_len1 = roundup_end1 - rounddown_start1; - assert(roundup_len1 >= alignment); - assert(roundup_len1 % alignment == 0); - uint64_t chunk_len1 = 0; - uint64_t read_len1 = 0; + assert(!buf->async_read_in_progress_); - assert(!bufs_[second].async_read_in_progress_ && - !DoesBufferContainData(second)); + // Go for ReadAsync and Read (if needed). + // offset and size alignment for first buffer with synchronous prefetching + uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0; + size_t read_len1 = 0; // For length == 0, skip the synchronous prefetching. read_len1 will be 0. if (length > 0) { - CalculateOffsetAndLen(alignment, offset, roundup_len1, curr_, - false /*refit_tail*/, chunk_len1); - assert(roundup_len1 >= chunk_len1); - read_len1 = static_cast(roundup_len1 - chunk_len1); - } - - // Prefetch in second buffer only if readahead_size_ > 0. - if (readahead_size_ > 0) { - // offset and size alignment for second buffer for asynchronous - // prefetching - uint64_t rounddown_start2 = roundup_end1; - uint64_t roundup_end2 = - Roundup(rounddown_start2 + readahead_size, alignment); - - // For length == 0, do the asynchronous prefetching in second instead of - // synchronous prefetching in curr_. - if (length == 0) { - rounddown_start2 = - bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize(); - roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); + if (buf->IsOffsetInBuffer(offset)) { + UpdateStats(/*found_in_buffer=*/false, + (buf->offset_ + buf->CurrentSize() - offset)); } + ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail*/ + true, start_offset1, alignment, length, readahead_size, + start_offset1, end_offset1, read_len1, + aligned_useful_len1); + } else { + UpdateStats(/*found_in_buffer=*/true, original_length); + } - // Second buffer might be out of bound if first buffer already prefetched - // that data. - if (!IsOffsetOutOfBound(rounddown_start2)) { - uint64_t roundup_len2 = roundup_end2 - rounddown_start2; - uint64_t chunk_len2 = 0; - CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, - false /*refit_tail*/, chunk_len2); - assert(chunk_len2 == 0); - // Update the buffer offset. - bufs_[second].offset_ = rounddown_start2; - assert(roundup_len2 >= chunk_len2); - uint64_t read_len2 = static_cast(roundup_len2 - chunk_len2); - s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); - if (!s.ok()) { - DestroyAndClearIOHandle(second); - bufs_[second].buffer_.Clear(); - return s; - } + // Prefetch in remaining buffer only if readahead_size > 0. + if (readahead_size > 0) { + s = PrefetchRemBuffers(opts, reader, end_offset1, alignment, + readahead_size); + if (!s.ok()) { + return s; } } if (read_len1 > 0) { - s = Read(opts, reader, read_len1, chunk_len1, rounddown_start1, curr_); + s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1); if (!s.ok()) { - if (bufs_[second].io_handle_ != nullptr) { - std::vector handles; - handles.emplace_back(bufs_[second].io_handle_); - { - StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS); - Status status = fs_->AbortIO(handles); - assert(status.ok()); - } - } - DestroyAndClearIOHandle(second); - bufs_[second].buffer_.Clear(); - bufs_[curr_].buffer_.Clear(); + AbortAllIOs(); + FreeAllBuffers(); return s; } } - // Copy remaining requested bytes to third_buffer. - if (copy_to_third_buffer && length > 0) { - CopyDataToBuffer(curr_, offset, length); + + // Copy remaining requested bytes to overlap_buffer. No need to update stats + // as data is prefetched during this call. + if (copy_to_overlap_buffer && length > 0) { + CopyDataToBuffer(buf, offset, length); } return s; } @@ -618,7 +674,7 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t n, Slice* result, Status* status, - bool for_compaction /* = false */) { + bool for_compaction) { bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status, for_compaction); if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { @@ -633,87 +689,7 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, bool FilePrefetchBuffer::TryReadFromCacheUntracked( const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t n, Slice* result, Status* status, - bool for_compaction /* = false */) { - if (track_min_offset_ && offset < min_offset_read_) { - min_offset_read_ = static_cast(offset); - } - if (!enable_ || (offset < bufs_[curr_].offset_)) { - return false; - } - - // If the buffer contains only a few of the requested bytes: - // If readahead is enabled: prefetch the remaining bytes + readahead bytes - // and satisfy the request. - // If readahead is not enabled: return false. - TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache", - &readahead_size_); - if (offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) { - if (readahead_size_ > 0) { - Status s; - assert(reader != nullptr); - assert(max_readahead_size_ >= readahead_size_); - if (for_compaction) { - s = Prefetch(opts, reader, offset, std::max(n, readahead_size_)); - } else { - if (IsOffsetInBuffer(offset, curr_)) { - RecordTick(stats_, PREFETCH_BYTES_USEFUL, - bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() - - offset); - } - if (implicit_auto_readahead_) { - if (!IsEligibleForPrefetch(offset, n)) { - // Ignore status as Prefetch is not called. - s.PermitUncheckedError(); - return false; - } - } - size_t current_readahead_size = ReadAheadSizeTuning(offset, n); - s = Prefetch(opts, reader, offset, n + current_readahead_size); - } - if (!s.ok()) { - if (status) { - *status = s; - } -#ifndef NDEBUG - IGNORE_STATUS_IF_ERROR(s); -#endif - return false; - } - readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); - } else { - return false; - } - } else if (!for_compaction) { - RecordTick(stats_, PREFETCH_HITS); - RecordTick(stats_, PREFETCH_BYTES_USEFUL, n); - } - UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/); - - uint64_t offset_in_buffer = offset - bufs_[curr_].offset_; - *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n); - return true; -} - -bool FilePrefetchBuffer::TryReadFromCacheAsync(const IOOptions& opts, - RandomAccessFileReader* reader, - uint64_t offset, size_t n, - Slice* result, Status* status) { - bool ret = - TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result, status); - if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { - if (ret) { - RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); - } else { - RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); - } - } - return ret; -} - -bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( - const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t n, Slice* result, Status* status) { + size_t n, Slice* result, Status* status, bool for_compaction) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -729,30 +705,32 @@ bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( // Random offset called. So abort the IOs. if (prev_offset_ != offset) { AbortAllIOs(); - bufs_[curr_].buffer_.Clear(); - bufs_[curr_ ^ 1].buffer_.Clear(); + FreeAllBuffers(); explicit_prefetch_submitted_ = false; return false; } } - if (!explicit_prefetch_submitted_ && offset < bufs_[curr_].offset_) { + AllocateBufferIfEmpty(); + BufferInfo* buf = GetFirstBuffer(); + + if (!explicit_prefetch_submitted_ && offset < buf->offset_) { return false; } bool prefetched = false; - bool copy_to_third_buffer = false; + bool copy_to_overlap_buffer = false; // If the buffer contains only a few of the requested bytes: - // If readahead is enabled: prefetch the remaining bytes + readahead bytes + // If readahead is enabled: prefetch the remaining bytes + readahead + // bytes // and satisfy the request. // If readahead is not enabled: return false. TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache", &readahead_size_); if (explicit_prefetch_submitted_ || - (bufs_[curr_].async_read_in_progress_ || - offset + n > - bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) { + (buf->async_read_in_progress_ || + offset + n > buf->offset_ + buf->CurrentSize())) { // In case readahead_size is trimmed (=0), we still want to poll the data // submitted with explicit_prefetch_submitted_=true. if (readahead_size_ > 0 || explicit_prefetch_submitted_) { @@ -760,21 +738,27 @@ bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( assert(reader != nullptr); assert(max_readahead_size_ >= readahead_size_); - if (implicit_auto_readahead_) { - if (!IsEligibleForPrefetch(offset, n)) { - // Ignore status as Prefetch is not called. - s.PermitUncheckedError(); - return false; + if (for_compaction) { + s = Prefetch(opts, reader, offset, std::max(n, readahead_size_)); + } else { + if (implicit_auto_readahead_) { + if (!IsEligibleForPrefetch(offset, n)) { + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } } - } - UpdateReadAheadSizeForUpperBound(offset, n); + // Prefetch n + readahead_size_/2 synchronously as remaining + // readahead_size_/2 will be prefetched asynchronously if num_buffers_ + // > 1. + s = PrefetchInternal( + opts, reader, offset, n, + (num_buffers_ > 1 ? readahead_size_ / 2 : readahead_size_), + copy_to_overlap_buffer); + explicit_prefetch_submitted_ = false; + } - // Prefetch n + readahead_size_/2 synchronously as remaining - // readahead_size_/2 will be prefetched asynchronously. - s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2, - copy_to_third_buffer); - explicit_prefetch_submitted_ = false; if (!s.ok()) { if (status) { *status = s; @@ -788,25 +772,28 @@ bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( } else { return false; } + } else if (!for_compaction) { + UpdateStats(/*found_in_buffer=*/true, n); } - UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/); + UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false); - uint32_t index = curr_; - if (copy_to_third_buffer) { - index = 2; + buf = GetFirstBuffer(); + if (copy_to_overlap_buffer) { + buf = overlap_buf_; } - uint64_t offset_in_buffer = offset - bufs_[index].offset_; - *result = Slice(bufs_[index].buffer_.BufferStart() + offset_in_buffer, n); + uint64_t offset_in_buffer = offset - buf->offset_; + *result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n); if (prefetched) { readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); } return true; } -void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req, +void FilePrefetchBuffer::PrefetchAsyncCallback(FSReadRequest& req, void* cb_arg) { - uint32_t index = *(static_cast(cb_arg)); + BufferInfo* buf = static_cast(cb_arg); + #ifndef NDEBUG if (req.result.size() < req.len) { // Fake an IO error to force db_stress fault injection to ignore @@ -817,19 +804,18 @@ void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req, #endif if (req.status.ok()) { - if (req.offset + req.result.size() <= - bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()) { + if (req.offset + req.result.size() <= buf->offset_ + buf->CurrentSize()) { // All requested bytes are already in the buffer or no data is read // because of EOF. So no need to update. return; } - if (req.offset < bufs_[index].offset_) { + if (req.offset < buf->offset_) { // Next block to be read has changed (Recent read was not a sequential // read). So ignore this read. return; } - size_t current_size = bufs_[index].buffer_.CurrentSize(); - bufs_[index].buffer_.Size(current_size + req.result.size()); + size_t current_size = buf->CurrentSize(); + buf->buffer_.Size(current_size + req.result.size()); } } @@ -848,133 +834,119 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, explicit_prefetch_submitted_ = false; bool is_eligible_for_prefetching = false; - UpdateReadAheadSizeForUpperBound(offset, n); if (readahead_size_ > 0 && (!implicit_auto_readahead_ || num_file_reads_ >= num_file_reads_for_auto_readahead_)) { - is_eligible_for_prefetching = true; + is_eligible_for_prefetching = true; } - // 1. Cancel any pending async read to make code simpler as buffers can be out + // Cancel any pending async read to make code simpler as buffers can be out // of sync. AbortAllIOs(); - - // 2. Clear outdated data. - UpdateBuffersIfNeeded(offset); - uint32_t second = curr_ ^ 1; - // Since PrefetchAsync can be called on non sequential reads. So offset can - // be less than curr_ buffers' offset. In that case also it clears both - // buffers. - if (DoesBufferContainData(curr_) && !IsOffsetInBuffer(offset, curr_)) { - bufs_[curr_].buffer_.Clear(); - bufs_[second].buffer_.Clear(); + // Free empty buffers after aborting IOs. + FreeEmptyBuffers(); + ClearOutdatedData(offset, n); + + // - Since PrefetchAsync can be called on non sequential reads. So offset can + // be less than first buffers' offset. In that case it clears all + // buffers. + // - In case of tuning of readahead_size, on Reseek, we have to clear all + // buffers otherwise, we may end up with inconsistent BlockHandles in queue + // and data in buffer. + if (!IsBufferQueueEmpty()) { + BufferInfo* buf = GetFirstBuffer(); + if (readaheadsize_cb_ != nullptr || !buf->IsOffsetInBuffer(offset)) { + FreeAllBuffers(); + } } UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false); bool data_found = false; - // 3. If curr_ has full data. - if (DoesBufferContainData(curr_) && IsDataBlockInBuffer(offset, n, curr_)) { - uint64_t offset_in_buffer = offset - bufs_[curr_].offset_; - *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n); - data_found = true; - // Update num_file_reads_ as TryReadFromCacheAsync won't be called for - // poll and update num_file_reads_ if data is found. - num_file_reads_++; - - // 3.1 If second also has some data or is not eligible for prefetching, - // return. - if (!is_eligible_for_prefetching || DoesBufferContainData(second)) { - return Status::OK(); + // If first buffer has full data. + if (!IsBufferQueueEmpty()) { + BufferInfo* buf = GetFirstBuffer(); + if (buf->DoesBufferContainData() && buf->IsDataBlockInBuffer(offset, n)) { + uint64_t offset_in_buffer = offset - buf->offset_; + *result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n); + data_found = true; + UpdateStats(/*found_in_buffer=*/true, n); + + // Update num_file_reads_ as TryReadFromCacheAsync won't be called for + // poll and update num_file_reads_ if data is found. + num_file_reads_++; + + // If next buffer contains some data or is not eligible for prefetching, + // return. + if (!is_eligible_for_prefetching || NumBuffersAllocated() > 1) { + return Status::OK(); + } + } else { + // Partial data in first buffer. Clear it to return continous data in one + // buffer. + FreeAllBuffers(); } - } else { - // Partial data in curr_. - bufs_[curr_].buffer_.Clear(); } - bufs_[second].buffer_.Clear(); + + std::string msg; Status s; size_t alignment = reader->file()->GetRequiredBufferAlignment(); - size_t prefetch_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0; + size_t readahead_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0; size_t offset_to_read = static_cast(offset); - uint64_t rounddown_start1 = 0; - uint64_t roundup_end1 = 0; - uint64_t rounddown_start2 = 0; - uint64_t roundup_end2 = 0; - uint64_t chunk_len1 = 0; - uint64_t chunk_len2 = 0; + uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0; size_t read_len1 = 0; - size_t read_len2 = 0; - // - If curr_ is empty. - // - Call async read for full data + prefetch_size on curr_. - // - Call async read for prefetch_size on second if eligible. - // - If curr_ is filled. - // - prefetch_size on second. + AllocateBufferIfEmpty(); + BufferInfo* buf = GetFirstBuffer(); + + // - If first buffer is empty. + // - Call async read for full data + readahead_size on first buffer. + // - Call async read for readahead_size on all remaining buffers if + // eligible. + // - If first buffer contains data, + // - Call async read for readahead_size on all remaining buffers if + // eligible. + // Calculate length and offsets for reading. - if (!DoesBufferContainData(curr_)) { + if (!buf->DoesBufferContainData()) { uint64_t roundup_len1; - // Prefetch full data + prefetch_size in curr_. + // Prefetch full data + readahead_size in the first buffer. if (is_eligible_for_prefetching || reader->use_direct_io()) { - rounddown_start1 = Rounddown(offset_to_read, alignment); - roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment); - roundup_len1 = roundup_end1 - rounddown_start1; - assert(roundup_len1 >= alignment); - assert(roundup_len1 % alignment == 0); + ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/false, + /*prev_buf_end_offset=*/start_offset1, alignment, n, + readahead_size, start_offset1, end_offset1, read_len1, + aligned_useful_len1); } else { - rounddown_start1 = offset_to_read; - roundup_end1 = offset_to_read + n; - roundup_len1 = roundup_end1 - rounddown_start1; + // No alignment or extra prefetching. + start_offset1 = offset_to_read; + end_offset1 = offset_to_read + n; + roundup_len1 = end_offset1 - start_offset1; + PrepareBufferForRead(buf, alignment, start_offset1, roundup_len1, false, + aligned_useful_len1); + assert(aligned_useful_len1 == 0); + assert(roundup_len1 >= aligned_useful_len1); + read_len1 = static_cast(roundup_len1); + buf->offset_ = start_offset1; } - CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_, - false, chunk_len1); - assert(chunk_len1 == 0); - assert(roundup_len1 >= chunk_len1); - read_len1 = static_cast(roundup_len1); - bufs_[curr_].offset_ = rounddown_start1; - } - if (is_eligible_for_prefetching) { - if (DoesBufferContainData(curr_)) { - rounddown_start2 = - bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize(); - } else { - rounddown_start2 = roundup_end1; - } - - // Second buffer might be out of bound if first buffer already prefetched - // that data. - if (!IsOffsetOutOfBound(rounddown_start2)) { - roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); - uint64_t roundup_len2 = roundup_end2 - rounddown_start2; - - CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, - false, chunk_len2); - assert(chunk_len2 == 0); - assert(roundup_len2 >= chunk_len2); - read_len2 = static_cast(roundup_len2 - chunk_len2); - // Update the buffer offset. - bufs_[second].offset_ = rounddown_start2; + if (read_len1 > 0) { + s = ReadAsync(buf, opts, reader, read_len1, start_offset1); + if (!s.ok()) { + DestroyAndClearIOHandle(buf); + FreeLastBuffer(); + return s; + } + explicit_prefetch_submitted_ = true; + prev_len_ = 0; } } - if (read_len1) { - s = ReadAsync(opts, reader, read_len1, rounddown_start1, curr_); - if (!s.ok()) { - DestroyAndClearIOHandle(curr_); - bufs_[curr_].buffer_.Clear(); - return s; - } - explicit_prefetch_submitted_ = true; - prev_len_ = 0; - } - if (read_len2) { - TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching"); - s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); + if (is_eligible_for_prefetching) { + s = PrefetchRemBuffers(opts, reader, end_offset1, alignment, + readahead_size); if (!s.ok()) { - DestroyAndClearIOHandle(second); - bufs_[second].buffer_.Clear(); return s; } readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); @@ -982,4 +954,39 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, return (data_found ? Status::OK() : Status::TryAgain()); } +Status FilePrefetchBuffer::PrefetchRemBuffers(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t end_offset1, + size_t alignment, + size_t readahead_size) { + Status s; + while (NumBuffersAllocated() < num_buffers_) { + BufferInfo* prev_buf = GetLastBuffer(); + uint64_t start_offset2 = prev_buf->initial_end_offset_; + + AllocateBuffer(); + BufferInfo* new_buf = GetLastBuffer(); + + uint64_t end_offset2 = start_offset2, aligned_useful_len2 = 0; + size_t read_len2 = 0; + ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false, + /*refit_tail=*/false, + /*prev_buf_end_offset=*/end_offset1, alignment, + /*length=*/0, readahead_size, start_offset2, + end_offset2, read_len2, aligned_useful_len2); + + if (read_len2 > 0) { + TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching"); + s = ReadAsync(new_buf, opts, reader, read_len2, start_offset2); + if (!s.ok()) { + DestroyAndClearIOHandle(new_buf); + FreeLastBuffer(); + return s; + } + } + end_offset1 = end_offset2; + } + return s; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index d71b28ab816..dfa8389294f 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -31,7 +32,39 @@ namespace ROCKSDB_NAMESPACE { struct IOOptions; class RandomAccessFileReader; +struct ReadaheadParams { + ReadaheadParams() {} + + // The initial readahead size. + size_t initial_readahead_size = 0; + + // The maximum readahead size. + // If max_readahead_size > readahead_size, then readahead size will be doubled + // on every IO until max_readahead_size is hit. Typically this is set as a + // multiple of initial_readahead_size. initial_readahead_size should be + // greater than equal to initial_readahead_size. + size_t max_readahead_size = 0; + + // If true, Readahead is enabled implicitly by rocksdb + // after doing sequential scans for num_file_reads_for_auto_readahead. + bool implicit_auto_readahead = false; + + // TODO akanksha - Remove num_file_reads when BlockPrefetcher is refactored. + uint64_t num_file_reads = 0; + uint64_t num_file_reads_for_auto_readahead = 0; + + // Number of buffers to maintain that contains prefetched data. If num_buffers + // > 1 then buffers will be filled asynchronously whenever they get emptied. + size_t num_buffers = 1; +}; + struct BufferInfo { + void ClearBuffer() { + buffer_.Clear(); + initial_end_offset_ = 0; + async_req_len_ = 0; + } + AlignedBuffer buffer_; uint64_t offset_ = 0; @@ -50,8 +83,50 @@ struct BufferInfo { IOHandleDeleter del_fn_ = nullptr; - // pos represents the index of this buffer in vector of BufferInfo. - uint32_t pos_ = 0; + // initial_end_offset is used to keep track of the end offset of the buffer + // that was originally called. It's helpful in case of autotuning of readahead + // size when callback is made to BlockBasedTableIterator. + // initial end offset of this buffer which will be the starting + // offset of next prefetch. + // + // For example - if end offset of previous buffer was 100 and because of + // readahead_size optimization, end_offset was trimmed to 60. Then for next + // prefetch call, start_offset should be intialized to 100 i.e start_offset = + // buf->initial_end_offset_. + uint64_t initial_end_offset_ = 0; + + bool IsDataBlockInBuffer(uint64_t offset, size_t length) { + assert(async_read_in_progress_ == false); + return (offset >= offset_ && + offset + length <= offset_ + buffer_.CurrentSize()); + } + + bool IsOffsetInBuffer(uint64_t offset) { + assert(async_read_in_progress_ == false); + return (offset >= offset_ && offset < offset_ + buffer_.CurrentSize()); + } + + bool DoesBufferContainData() { + assert(async_read_in_progress_ == false); + return buffer_.CurrentSize() > 0; + } + + bool IsBufferOutdated(uint64_t offset) { + return (!async_read_in_progress_ && DoesBufferContainData() && + offset >= offset_ + buffer_.CurrentSize()); + } + + bool IsBufferOutdatedWithAsyncProgress(uint64_t offset) { + return (async_read_in_progress_ && io_handle_ != nullptr && + offset >= offset_ + async_req_len_); + } + + bool IsOffsetInBufferWithAsyncProgress(uint64_t offset) { + return (async_read_in_progress_ && offset >= offset_ && + offset < offset_ + async_req_len_); + } + + size_t CurrentSize() { return buffer_.CurrentSize(); } }; enum class FilePrefetchBufferUsage { @@ -60,66 +135,87 @@ enum class FilePrefetchBufferUsage { kUnknown, }; +// Implementation: +// FilePrefetchBuffer maintains a dequeu of free buffers (free_bufs_) with no +// data and bufs_ which contains the prefetched data. Whenever a buffer is +// consumed or is outdated (w.r.t. to requested offset), that buffer is cleared +// and returned to free_bufs_. +// +// If a buffer is available in free_bufs_, it's moved to bufs_ and is sent for +// prefetching. +// num_buffers_ defines how many buffers FilePrefetchBuffer can maintain at a +// time that contains prefetched data with num_buffers_ == bufs_.size() + +// free_bufs_.size(). +// +// If num_buffers_ == 1, it's a sequential read flow. Read API will be called on +// that one buffer whenever the data is requested and is not in the buffer. +// If num_buffers_ > 1, then the data is prefetched asynchronosuly in the +// buffers whenever the data is consumed from the buffers and that buffer is +// freed. +// If num_buffers > 1, then requested data can be overlapping between 2 buffers. +// To return the continuous buffer, overlap_buf_ is used. The requested data is +// copied from 2 buffers to the overlap_buf_ and overlap_buf_ is returned to +// the caller. + // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: // Constructor. // // All arguments are optional. - // readahead_size : the initial readahead size. - // max_readahead_size : the maximum readahead size. - // If max_readahead_size > readahead_size, the readahead size will be - // doubled on every IO until max_readahead_size is hit. - // Typically this is set as a multiple of readahead_size. - // max_readahead_size should be greater than equal to readahead_size. - // enable : controls whether reading from the buffer is enabled. - // If false, TryReadFromCache() always return false, and we only take stats - // for the minimum offset if track_min_offset = true. + // ReadaheadParams : Parameters to control the readahead behavior. + // enable : controls whether reading from the buffer is enabled. + // If false, TryReadFromCache() always return false, and we + // only take stats for the minimum offset if + // track_min_offset = true. + // See below NOTE about mmap reads. // track_min_offset : Track the minimum offset ever read and collect stats on - // it. Used for adaptable readahead of the file footer/metadata. - // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after - // doing sequential scans for two times. + // it. Used for adaptable readahead of the file + // footer/metadata. // - // Automatic readhead is enabled for a file if readahead_size - // and max_readahead_size are passed in. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. + // NOTE: FilePrefetchBuffer is incompatible with prefetching from + // RandomAccessFileReaders using mmap reads, so it is common to use + // `!use_mmap_reads` for the `enable` parameter. FilePrefetchBuffer( - size_t readahead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false, - bool implicit_auto_readahead = false, uint64_t num_file_reads = 0, - uint64_t num_file_reads_for_auto_readahead = 0, - uint64_t upper_bound_offset = 0, FileSystem* fs = nullptr, + const ReadaheadParams& readahead_params = {}, bool enable = true, + bool track_min_offset = false, FileSystem* fs = nullptr, SystemClock* clock = nullptr, Statistics* stats = nullptr, - const std::function& cb = nullptr, + const std::function& cb = nullptr, FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) - : curr_(0), - readahead_size_(readahead_size), - initial_auto_readahead_size_(readahead_size), - max_readahead_size_(max_readahead_size), + : readahead_size_(readahead_params.initial_readahead_size), + initial_auto_readahead_size_(readahead_params.initial_readahead_size), + max_readahead_size_(readahead_params.max_readahead_size), min_offset_read_(std::numeric_limits::max()), enable_(enable), track_min_offset_(track_min_offset), - implicit_auto_readahead_(implicit_auto_readahead), + implicit_auto_readahead_(readahead_params.implicit_auto_readahead), prev_offset_(0), prev_len_(0), - num_file_reads_for_auto_readahead_(num_file_reads_for_auto_readahead), - num_file_reads_(num_file_reads), + num_file_reads_for_auto_readahead_( + readahead_params.num_file_reads_for_auto_readahead), + num_file_reads_(readahead_params.num_file_reads), explicit_prefetch_submitted_(false), fs_(fs), clock_(clock), stats_(stats), usage_(usage), - upper_bound_offset_(upper_bound_offset), - readaheadsize_cb_(cb) { + readaheadsize_cb_(cb), + num_buffers_(readahead_params.num_buffers) { assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) || (num_file_reads_ == 0)); - // If ReadOptions.async_io is enabled, data is asynchronously filled in - // second buffer while curr_ is being consumed. If data is overlapping in - // two buffers, data is copied to third buffer to return continuous buffer. - bufs_.resize(3); - for (uint32_t i = 0; i < 2; i++) { - bufs_[i].pos_ = i; + + // If num_buffers_ > 1, data is asynchronously filled in the + // queue. As result, data can be overlapping in two buffers. It copies the + // data to overlap_buf_ in order to to return continuous buffer. + if (num_buffers_ > 1) { + overlap_buf_ = new BufferInfo(); + } + + free_bufs_.resize(num_buffers_); + for (uint32_t i = 0; i < num_buffers_; i++) { + free_bufs_[i] = new BufferInfo(); } } @@ -127,10 +223,9 @@ class FilePrefetchBuffer { // Abort any pending async read request before destroying the class object. if (fs_ != nullptr) { std::vector handles; - for (uint32_t i = 0; i < 2; i++) { - if (bufs_[i].async_read_in_progress_ && - bufs_[i].io_handle_ != nullptr) { - handles.emplace_back(bufs_[i].io_handle_); + for (auto& buf : bufs_) { + if (buf->async_read_in_progress_ && buf->io_handle_ != nullptr) { + handles.emplace_back(buf->io_handle_); } } if (!handles.empty()) { @@ -138,60 +233,63 @@ class FilePrefetchBuffer { Status s = fs_->AbortIO(handles); assert(s.ok()); } + + for (auto& buf : bufs_) { + if (buf->io_handle_ != nullptr) { + DestroyAndClearIOHandle(buf); + buf->ClearBuffer(); + } + buf->async_read_in_progress_ = false; + } } // Prefetch buffer bytes discarded. uint64_t bytes_discarded = 0; - // Iterated over 2 buffers. - for (int i = 0; i < 2; i++) { - int first = i; - int second = i ^ 1; - - if (DoesBufferContainData(first)) { - // If last block was read completely from first and some bytes in - // first buffer are still unconsumed. - if (prev_offset_ >= bufs_[first].offset_ && - prev_offset_ + prev_len_ < - bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) { - bytes_discarded += bufs_[first].buffer_.CurrentSize() - - (prev_offset_ + prev_len_ - bufs_[first].offset_); + // Iterated over buffers. + for (auto& buf : bufs_) { + if (buf->DoesBufferContainData()) { + // If last read was from this block and some bytes are still unconsumed. + if (prev_offset_ >= buf->offset_ && + prev_offset_ + prev_len_ < buf->offset_ + buf->CurrentSize()) { + bytes_discarded += + buf->CurrentSize() - (prev_offset_ + prev_len_ - buf->offset_); } - // If data was in second buffer and some/whole block bytes were read - // from second buffer. - else if (prev_offset_ < bufs_[first].offset_ && - !DoesBufferContainData(second)) { - // If last block read was completely from different buffer, this - // buffer is unconsumed. - if (prev_offset_ + prev_len_ <= bufs_[first].offset_) { - bytes_discarded += bufs_[first].buffer_.CurrentSize(); - } - // If last block read overlaps with this buffer and some data is - // still unconsumed and previous buffer (second) is not cleared. - else if (prev_offset_ + prev_len_ > bufs_[first].offset_ && - bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() == - bufs_[second].offset_) { - bytes_discarded += bufs_[first].buffer_.CurrentSize() - - (/*bytes read from this buffer=*/prev_len_ - - (bufs_[first].offset_ - prev_offset_)); - } + // If last read was from previous blocks and this block is unconsumed. + else if (prev_offset_ < buf->offset_ && + prev_offset_ + prev_len_ <= buf->offset_) { + bytes_discarded += buf->CurrentSize(); } } } - for (uint32_t i = 0; i < 2; i++) { - // Release io_handle. - DestroyAndClearIOHandle(i); - } RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); + + for (auto& buf : bufs_) { + delete buf; + buf = nullptr; + } + + for (auto& buf : free_bufs_) { + delete buf; + buf = nullptr; + } + + if (overlap_buf_ != nullptr) { + delete overlap_buf_; + overlap_buf_ = nullptr; + } } bool Enabled() const { return enable_; } - // Load data into the buffer from a file. + // Called externally by user to only load data into the buffer from a file + // with num_buffers_ should be set to default(1). + // // opts : the IO options to use. // reader : the file reader. // offset : the file offset to start reading from. // n : the number of bytes to read. + // Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t n); @@ -225,23 +323,16 @@ class FilePrefetchBuffer { uint64_t offset, size_t n, Slice* result, Status* s, bool for_compaction = false); - bool TryReadFromCacheAsync(const IOOptions& opts, - RandomAccessFileReader* reader, uint64_t offset, - size_t n, Slice* result, Status* status); - // The minimum `offset` ever passed to TryReadFromCache(). This will nly be // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } - size_t GetPrefetchOffset() const { return bufs_[curr_].offset_; } + size_t GetPrefetchOffset() const { return bufs_.front()->offset_; } // Called in case of implicit auto prefetching. void UpdateReadPattern(const uint64_t& offset, const size_t& len, bool decrease_readaheadsize) { if (decrease_readaheadsize) { - // Since this block was eligible for prefetch but it was found in - // cache, so check and decrease the readahead_size by 8KB (default) - // if eligible. DecreaseReadAheadIfEligible(offset, len); } prev_offset_ = offset; @@ -256,6 +347,10 @@ class FilePrefetchBuffer { void DecreaseReadAheadIfEligible(uint64_t offset, size_t size, size_t value = DEFAULT_DECREMENT) { + if (bufs_.empty()) { + return; + } + // Decrease the readahead_size if // - its enabled internally by RocksDB (implicit_auto_readahead_) and, // - readahead_size is greater than 0 and, @@ -265,11 +360,12 @@ class FilePrefetchBuffer { // - block is sequential with the previous read and, // - num_file_reads_ + 1 (including this read) > // num_file_reads_for_auto_readahead_ - size_t curr_size = bufs_[curr_].async_read_in_progress_ - ? bufs_[curr_].async_req_len_ - : bufs_[curr_].buffer_.CurrentSize(); + + size_t curr_size = bufs_.front()->async_read_in_progress_ + ? bufs_.front()->async_req_len_ + : bufs_.front()->CurrentSize(); if (implicit_auto_readahead_ && readahead_size_ > 0) { - if ((offset + size > bufs_[curr_].offset_ + curr_size) && + if ((offset + size > bufs_.front()->offset_ + curr_size) && IsBlockSequential(offset) && (num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) { readahead_size_ = @@ -280,46 +376,49 @@ class FilePrefetchBuffer { } // Callback function passed to underlying FS in case of asynchronous reads. - void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg); - - void ResetUpperBoundOffset(uint64_t upper_bound_offset) { - upper_bound_offset_ = upper_bound_offset; - readahead_size_ = initial_auto_readahead_size_; + void PrefetchAsyncCallback(FSReadRequest& req, void* cb_arg); + + void TEST_GetBufferOffsetandSize( + std::vector>& buffer_info) { + for (size_t i = 0; i < bufs_.size(); i++) { + buffer_info[i].first = bufs_[i]->offset_; + buffer_info[i].second = bufs_[i]->async_read_in_progress_ + ? bufs_[i]->async_req_len_ + : bufs_[i]->CurrentSize(); + } } private: // Calculates roundoff offset and length to be prefetched based on alignment // and data present in buffer_. It also allocates new buffer or refit tail if // required. - void CalculateOffsetAndLen(size_t alignment, uint64_t offset, - size_t roundup_len, uint32_t index, - bool refit_tail, uint64_t& chunk_len); + void PrepareBufferForRead(BufferInfo* buf, size_t alignment, uint64_t offset, + size_t roundup_len, bool refit_tail, + uint64_t& aligned_useful_len); - void AbortIOIfNeeded(uint64_t offset); + void AbortOutdatedIO(uint64_t offset); void AbortAllIOs(); - void UpdateBuffersIfNeeded(uint64_t offset); + void ClearOutdatedData(uint64_t offset, size_t len); - // It calls Poll API if any there is any pending asynchronous request. It then - // checks if data is in any buffer. It clears the outdated data and swaps the - // buffers if required. - void PollAndUpdateBuffersIfNeeded(uint64_t offset); + // It calls Poll API to check for any pending asynchronous request. + void PollIfNeeded(uint64_t offset, size_t len); - Status PrefetchAsyncInternal(const IOOptions& opts, - RandomAccessFileReader* reader, uint64_t offset, - size_t length, size_t readahead_size, - bool& copy_to_third_buffer); + Status PrefetchInternal(const IOOptions& opts, RandomAccessFileReader* reader, + uint64_t offset, size_t length, size_t readahead_size, + bool& copy_to_third_buffer); - Status Read(const IOOptions& opts, RandomAccessFileReader* reader, - uint64_t read_len, uint64_t chunk_len, uint64_t rounddown_start, - uint32_t index); + Status Read(BufferInfo* buf, const IOOptions& opts, + RandomAccessFileReader* reader, uint64_t read_len, + uint64_t aligned_useful_len, uint64_t start_offset); - Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader, - uint64_t read_len, uint64_t rounddown_start, uint32_t index); + Status ReadAsync(BufferInfo* buf, const IOOptions& opts, + RandomAccessFileReader* reader, uint64_t read_len, + uint64_t start_offset); - // Copy the data from src to third buffer. - void CopyDataToBuffer(uint32_t src, uint64_t& offset, size_t& length); + // Copy the data from src to overlap_buf_. + void CopyDataToBuffer(BufferInfo* src, uint64_t& offset, size_t& length); bool IsBlockSequential(const size_t& offset) { return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); @@ -355,64 +454,24 @@ class FilePrefetchBuffer { return true; } - // Helper functions. - bool IsDataBlockInBuffer(uint64_t offset, size_t length, uint32_t index) { - return (offset >= bufs_[index].offset_ && - offset + length <= - bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); - } - bool IsOffsetInBuffer(uint64_t offset, uint32_t index) { - return (offset >= bufs_[index].offset_ && - offset < bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); - } - bool DoesBufferContainData(uint32_t index) { - return bufs_[index].buffer_.CurrentSize() > 0; - } - bool IsBufferOutdated(uint64_t offset, uint32_t index) { - return ( - !bufs_[index].async_read_in_progress_ && DoesBufferContainData(index) && - offset >= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()); - } - bool IsBufferOutdatedWithAsyncProgress(uint64_t offset, uint32_t index) { - return (bufs_[index].async_read_in_progress_ && - bufs_[index].io_handle_ != nullptr && - offset >= bufs_[index].offset_ + bufs_[index].async_req_len_); - } - bool IsOffsetInBufferWithAsyncProgress(uint64_t offset, uint32_t index) { - return (bufs_[index].async_read_in_progress_ && - offset >= bufs_[index].offset_ && - offset < bufs_[index].offset_ + bufs_[index].async_req_len_); - } - - bool IsSecondBuffEligibleForPrefetching() { - uint32_t second = curr_ ^ 1; - if (bufs_[second].async_read_in_progress_) { - return false; - } - assert(!bufs_[curr_].async_read_in_progress_); - - if (DoesBufferContainData(curr_) && DoesBufferContainData(second) && - (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() == - bufs_[second].offset_)) { + bool IsEligibleForFurtherPrefetching() { + if (free_bufs_.empty()) { return false; } - // Readahead size can be 0 because of trimming. if (readahead_size_ == 0) { return false; } - - bufs_[second].buffer_.Clear(); return true; } - void DestroyAndClearIOHandle(uint32_t index) { - if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) { - bufs_[index].del_fn_(bufs_[index].io_handle_); - bufs_[index].io_handle_ = nullptr; - bufs_[index].del_fn_ = nullptr; + void DestroyAndClearIOHandle(BufferInfo* buf) { + if (buf->io_handle_ != nullptr && buf->del_fn_ != nullptr) { + buf->del_fn_(buf->io_handle_); + buf->io_handle_ = nullptr; + buf->del_fn_ = nullptr; } - bufs_[index].async_read_in_progress_ = false; + buf->async_read_in_progress_ = false; } Status HandleOverlappingData(const IOOptions& opts, @@ -427,49 +486,102 @@ class FilePrefetchBuffer { Status* s, bool for_compaction = false); - bool TryReadFromCacheAsyncUntracked(const IOOptions& opts, - RandomAccessFileReader* reader, - uint64_t offset, size_t n, Slice* result, - Status* status); - - void UpdateReadAheadSizeForUpperBound(uint64_t offset, size_t n) { - // Adjust readhahead_size till upper_bound if upper_bound_offset_ is - // set. - if (readahead_size_ > 0 && upper_bound_offset_ > 0 && - upper_bound_offset_ > offset) { - if (upper_bound_offset_ < offset + n + readahead_size_) { - readahead_size_ = (upper_bound_offset_ - offset) - n; - RecordTick(stats_, READAHEAD_TRIMMED); - } + void ReadAheadSizeTuning(BufferInfo* buf, bool read_curr_block, + bool refit_tail, uint64_t prev_buf_end_offset, + size_t alignment, size_t length, + size_t readahead_size, uint64_t& offset, + uint64_t& end_offset, size_t& read_len, + uint64_t& aligned_useful_len); + + void UpdateStats(bool found_in_buffer, size_t length_found) { + if (found_in_buffer) { + RecordTick(stats_, PREFETCH_HITS); + } + if (length_found > 0) { + RecordTick(stats_, PREFETCH_BYTES_USEFUL, length_found); } } - inline bool IsOffsetOutOfBound(uint64_t offset) { - if (upper_bound_offset_ > 0) { - return (offset >= upper_bound_offset_); + void UpdateReadAheadTrimmedStat(size_t initial_length, + size_t updated_length) { + if (initial_length != updated_length) { + RecordTick(stats_, READAHEAD_TRIMMED); } - return false; } - // Performs tuning to calculate readahead_size. - size_t ReadAheadSizeTuning(uint64_t offset, size_t n) { - UpdateReadAheadSizeForUpperBound(offset, n); + Status PrefetchRemBuffers(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t end_offset1, size_t alignment, + size_t readahead_size); + + // *** BEGIN APIs related to allocating and freeing buffers *** + bool IsBufferQueueEmpty() { return bufs_.empty(); } + + BufferInfo* GetFirstBuffer() { return bufs_.front(); } - if (readaheadsize_cb_ != nullptr && readahead_size_ > 0) { - size_t updated_readahead_size = 0; - readaheadsize_cb_(offset, readahead_size_, updated_readahead_size); - if (readahead_size_ != updated_readahead_size) { - RecordTick(stats_, READAHEAD_TRIMMED); + BufferInfo* GetLastBuffer() { return bufs_.back(); } + + size_t NumBuffersAllocated() { return bufs_.size(); } + + void AllocateBuffer() { + assert(!free_bufs_.empty()); + BufferInfo* buf = free_bufs_.front(); + free_bufs_.pop_front(); + bufs_.emplace_back(buf); + } + + void AllocateBufferIfEmpty() { + if (bufs_.empty()) { + AllocateBuffer(); + } + } + + void FreeFrontBuffer() { + BufferInfo* buf = bufs_.front(); + buf->ClearBuffer(); + bufs_.pop_front(); + free_bufs_.emplace_back(buf); + } + + void FreeLastBuffer() { + BufferInfo* buf = bufs_.back(); + buf->ClearBuffer(); + bufs_.pop_back(); + free_bufs_.emplace_back(buf); + } + + void FreeAllBuffers() { + while (!bufs_.empty()) { + BufferInfo* buf = bufs_.front(); + buf->ClearBuffer(); + bufs_.pop_front(); + free_bufs_.emplace_back(buf); + } + } + + void FreeEmptyBuffers() { + if (bufs_.empty()) { + return; + } + + std::deque tmp_buf; + while (!bufs_.empty()) { + BufferInfo* buf = bufs_.front(); + bufs_.pop_front(); + if (buf->async_read_in_progress_ || buf->DoesBufferContainData()) { + tmp_buf.emplace_back(buf); + } else { + free_bufs_.emplace_back(buf); } - return updated_readahead_size; } - return readahead_size_; + bufs_ = tmp_buf; } - std::vector bufs_; - // curr_ represents the index for bufs_ indicating which buffer is being - // consumed currently. - uint32_t curr_; + // *** END APIs related to allocating and freeing buffers *** + + std::deque bufs_; + std::deque free_bufs_; + BufferInfo* overlap_buf_ = nullptr; size_t readahead_size_; size_t initial_auto_readahead_size_; @@ -497,7 +609,7 @@ class FilePrefetchBuffer { uint64_t num_file_reads_; // If explicit_prefetch_submitted_ is set then it indicates RocksDB called - // PrefetchAsync to submit request. It needs to call TryReadFromCacheAsync to + // PrefetchAsync to submit request. It needs to call TryReadFromCache to // poll the submitted request without checking if data is sequential and // num_file_reads_. bool explicit_prefetch_submitted_; @@ -508,10 +620,10 @@ class FilePrefetchBuffer { FilePrefetchBufferUsage usage_; - // upper_bound_offset_ is set when ReadOptions.iterate_upper_bound and - // ReadOptions.auto_readahead_size are set to trim readahead_size upto - // upper_bound_offset_ during prefetching. - uint64_t upper_bound_offset_ = 0; - std::function readaheadsize_cb_; + std::function readaheadsize_cb_; + + // num_buffers_ is the number of buffers maintained by FilePrefetchBuffer to + // prefetch the data at a time. + size_t num_buffers_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/file_util.cc b/file/file_util.cc index 9eee106378b..cd63642ae0c 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -19,16 +19,17 @@ namespace ROCKSDB_NAMESPACE { // Utility function to copy a file up to a specified length IOStatus CopyFile(FileSystem* fs, const std::string& source, + Temperature src_temp_hint, std::unique_ptr& dest_writer, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer, - const Temperature temperature) { + const std::shared_ptr& io_tracer) { FileOptions soptions; IOStatus io_s; std::unique_ptr src_reader; + const IOOptions opts; { - soptions.temperature = temperature; + soptions.temperature = src_temp_hint; std::unique_ptr srcfile; io_s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr); if (!io_s.ok()) { @@ -37,7 +38,7 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, if (size == 0) { // default argument means copy everything - io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr); + io_s = fs->GetFileSize(source, opts, &size, nullptr); if (!io_s.ok()) { return io_s; } @@ -60,37 +61,39 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, if (slice.size() == 0) { return IOStatus::Corruption("file too small"); } - io_s = dest_writer->Append(slice); + + io_s = dest_writer->Append(opts, slice); if (!io_s.ok()) { return io_s; } size -= slice.size(); } - return dest_writer->Sync(use_fsync); + return dest_writer->Sync(opts, use_fsync); } IOStatus CopyFile(FileSystem* fs, const std::string& source, - const std::string& destination, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer, - const Temperature temperature) { + Temperature src_temp_hint, const std::string& destination, + Temperature dst_temp, uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer) { FileOptions options; IOStatus io_s; std::unique_ptr dest_writer; { - options.temperature = temperature; + options.temperature = dst_temp; std::unique_ptr destfile; io_s = fs->NewWritableFile(destination, options, &destfile, nullptr); if (!io_s.ok()) { return io_s; } + // TODO: pass in Histograms if the destination file is sst or blob dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, options)); } - return CopyFile(fs, source, dest_writer, size, use_fsync, io_tracer, - temperature); + return CopyFile(fs, source, src_temp_hint, dest_writer, size, use_fsync, + io_tracer); } // Utility function to create a file with the provided contents @@ -99,19 +102,21 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination, const EnvOptions soptions; IOStatus io_s; std::unique_ptr dest_writer; + const IOOptions opts; std::unique_ptr destfile; io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); if (!io_s.ok()) { return io_s; } + // TODO: pass in Histograms if the destination file is sst or blob dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, soptions)); - io_s = dest_writer->Append(Slice(contents)); + io_s = dest_writer->Append(opts, Slice(contents)); if (!io_s.ok()) { return io_s; } - return dest_writer->Sync(use_fsync); + return dest_writer->Sync(opts, use_fsync); } Status DeleteDBFile(const ImmutableDBOptions* db_options, diff --git a/file/file_util.h b/file/file_util.h index 2c91718eeb6..d06d9c28672 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -20,27 +20,25 @@ namespace ROCKSDB_NAMESPACE { // use_fsync maps to options.use_fsync, which determines the way that // the file is synced after copying. -extern IOStatus CopyFile(FileSystem* fs, const std::string& source, - std::unique_ptr& dest_writer, - uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer, - const Temperature temperature); -extern IOStatus CopyFile(FileSystem* fs, const std::string& source, - const std::string& destination, uint64_t size, - bool use_fsync, - const std::shared_ptr& io_tracer, - const Temperature temperature); +IOStatus CopyFile(FileSystem* fs, const std::string& source, + Temperature src_temp_hint, + std::unique_ptr& dest_writer, + uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer); +IOStatus CopyFile(FileSystem* fs, const std::string& source, + Temperature src_temp_hint, const std::string& destination, + Temperature dst_temp, uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer); inline IOStatus CopyFile(const std::shared_ptr& fs, - const std::string& source, - const std::string& destination, uint64_t size, - bool use_fsync, - const std::shared_ptr& io_tracer, - const Temperature temperature) { - return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer, - temperature); + const std::string& source, Temperature src_temp_hint, + const std::string& destination, Temperature dst_temp, + uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer) { + return CopyFile(fs.get(), source, src_temp_hint, destination, dst_temp, size, + use_fsync, io_tracer); } -extern IOStatus CreateFile(FileSystem* fs, const std::string& destination, - const std::string& contents, bool use_fsync); +IOStatus CreateFile(FileSystem* fs, const std::string& destination, + const std::string& contents, bool use_fsync); inline IOStatus CreateFile(const std::shared_ptr& fs, const std::string& destination, @@ -48,13 +46,12 @@ inline IOStatus CreateFile(const std::shared_ptr& fs, return CreateFile(fs.get(), destination, contents, use_fsync); } -extern Status DeleteDBFile(const ImmutableDBOptions* db_options, - const std::string& fname, - const std::string& path_to_sync, const bool force_bg, - const bool force_fg); +Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, const std::string& path_to_sync, + const bool force_bg, const bool force_fg); // TODO(hx235): pass the whole DBOptions intead of its individual fields -extern IOStatus GenerateOneFileChecksum( +IOStatus GenerateOneFileChecksum( FileSystem* fs, const std::string& file_path, FileChecksumGenFactory* checksum_factory, const std::string& requested_checksum_func_name, std::string* file_checksum, @@ -93,6 +90,14 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, return IOStatus::OK(); } +inline IOStatus PrepareIOFromWriteOptions(const WriteOptions& wo, + IOOptions& opts) { + opts.rate_limiter_priority = wo.rate_limiter_priority; + opts.io_activity = wo.io_activity; + + return IOStatus::OK(); +} + // Test method to delete the input directory and all of its contents. // This method is destructive and is meant for use only in tests!!! Status DestroyDir(Env* env, const std::string& dir); diff --git a/file/filename.cc b/file/filename.cc index 1e04c73395e..b34a0e113e8 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -8,14 +8,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "file/filename.h" -#include -#include - +#include #include +#include #include +#include "file/file_util.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -385,8 +386,8 @@ bool ParseFileName(const std::string& fname, uint64_t* number, return true; } -IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, - uint64_t descriptor_number, +IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, + const std::string& dbname, uint64_t descriptor_number, FSDirectory* dir_contains_current_file) { // Remove leading "dbname/" and add newline to manifest file name std::string manifest = DescriptorFileName(dbname, descriptor_number); @@ -394,21 +395,25 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, assert(contents.starts_with(dbname + "/")); contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); - IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + IOOptions opts; + IOStatus s = PrepareIOFromWriteOptions(write_options, opts); + if (s.ok()) { + s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts); + } TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); - s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); + s = fs->RenameFile(tmp, CurrentFileName(dbname), opts, nullptr); TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); } if (s.ok()) { if (dir_contains_current_file != nullptr) { s = dir_contains_current_file->FsyncWithDirOptions( - IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname))); + opts, nullptr, DirFsyncOptions(CurrentFileName(dbname))); } } else { - fs->DeleteFile(tmp, IOOptions(), nullptr) + fs->DeleteFile(tmp, opts, nullptr) .PermitUncheckedError(); // NOTE: PermitUncheckedError is acceptable // here as we are already handling an error // case, and this is just a best-attempt @@ -417,8 +422,8 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, return s; } -Status SetIdentityFile(Env* env, const std::string& dbname, - const std::string& db_id) { +Status SetIdentityFile(const WriteOptions& write_options, Env* env, + const std::string& dbname, const std::string& db_id) { std::string id; if (db_id.empty()) { id = env->GenerateUniqueId(); @@ -429,17 +434,21 @@ Status SetIdentityFile(Env* env, const std::string& dbname, // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); std::string identify_file_name = IdentityFileName(dbname); - Status s = WriteStringToFile(env, id, tmp, true); + Status s; + IOOptions opts; + s = PrepareIOFromWriteOptions(write_options, opts); + if (s.ok()) { + s = WriteStringToFile(env, id, tmp, true, &opts); + } if (s.ok()) { s = env->RenameFile(tmp, identify_file_name); } std::unique_ptr dir_obj; if (s.ok()) { - s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj, - nullptr); + s = env->GetFileSystem()->NewDirectory(dbname, opts, &dir_obj, nullptr); } if (s.ok()) { - s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + s = dir_obj->FsyncWithDirOptions(opts, nullptr, DirFsyncOptions(identify_file_name)); } @@ -447,7 +456,7 @@ Status SetIdentityFile(Env* env, const std::string& dbname, // if it is not impelmented. Detailed explanations can be found in // db/db_impl/db_impl.h if (s.ok()) { - Status temp_s = dir_obj->Close(IOOptions(), nullptr); + Status temp_s = dir_obj->Close(opts, nullptr); if (!temp_s.ok()) { if (temp_s.IsNotSupported()) { temp_s.PermitUncheckedError(); @@ -463,10 +472,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname, } IOStatus SyncManifest(const ImmutableDBOptions* db_options, + const WriteOptions& write_options, WritableFileWriter* file) { TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2); StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS); - return file->Sync(db_options->use_fsync); + IOOptions io_options; + IOStatus s = PrepareIOFromWriteOptions(write_options, io_options); + if (!s.ok()) { + return s; + } + return file->Sync(io_options, db_options->use_fsync); } Status GetInfoLogFiles(const std::shared_ptr& fs, diff --git a/file/filename.h b/file/filename.h index 2eb125b6a17..56bbd78d555 100644 --- a/file/filename.h +++ b/file/filename.h @@ -40,69 +40,68 @@ constexpr char kFilePathSeparator = '/'; // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". -extern std::string LogFileName(const std::string& dbname, uint64_t number); +std::string LogFileName(const std::string& dbname, uint64_t number); -extern std::string LogFileName(uint64_t number); +std::string LogFileName(uint64_t number); -extern std::string BlobFileName(uint64_t number); +std::string BlobFileName(uint64_t number); -extern std::string BlobFileName(const std::string& bdirname, uint64_t number); +std::string BlobFileName(const std::string& bdirname, uint64_t number); -extern std::string BlobFileName(const std::string& dbname, - const std::string& blob_dir, uint64_t number); +std::string BlobFileName(const std::string& dbname, const std::string& blob_dir, + uint64_t number); -extern std::string ArchivalDirectory(const std::string& dbname); +std::string ArchivalDirectory(const std::string& dbname); // Return the name of the archived log file with the specified number // in the db named by "dbname". The result will be prefixed with "dbname". -extern std::string ArchivedLogFileName(const std::string& dbname, uint64_t num); +std::string ArchivedLogFileName(const std::string& dbname, uint64_t num); -extern std::string MakeTableFileName(const std::string& name, uint64_t number); +std::string MakeTableFileName(const std::string& name, uint64_t number); -extern std::string MakeTableFileName(uint64_t number); +std::string MakeTableFileName(uint64_t number); // Return the name of sstable with LevelDB suffix // created from RocksDB sstable suffixed name -extern std::string Rocks2LevelTableFileName(const std::string& fullname); +std::string Rocks2LevelTableFileName(const std::string& fullname); // the reverse function of MakeTableFileName // TODO(yhchiang): could merge this function with ParseFileName() -extern uint64_t TableFileNameToNumber(const std::string& name); +uint64_t TableFileNameToNumber(const std::string& name); // Return the name of the sstable with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". -extern std::string TableFileName(const std::vector& db_paths, - uint64_t number, uint32_t path_id); +std::string TableFileName(const std::vector& db_paths, uint64_t number, + uint32_t path_id); // Sufficient buffer size for FormatFileNumber. const size_t kFormatFileNumberBufSize = 38; -extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, - size_t out_buf_size); +void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size); // Return the name of the descriptor file for the db named by // "dbname" and the specified incarnation number. The result will be // prefixed with "dbname". -extern std::string DescriptorFileName(const std::string& dbname, - uint64_t number); +std::string DescriptorFileName(const std::string& dbname, uint64_t number); -extern std::string DescriptorFileName(uint64_t number); +std::string DescriptorFileName(uint64_t number); extern const std::string kCurrentFileName; // = "CURRENT" // Return the name of the current file. This file contains the name // of the current manifest file. The result will be prefixed with // "dbname". -extern std::string CurrentFileName(const std::string& dbname); +std::string CurrentFileName(const std::string& dbname); // Return the name of the lock file for the db named by // "dbname". The result will be prefixed with "dbname". -extern std::string LockFileName(const std::string& dbname); +std::string LockFileName(const std::string& dbname); // Return the name of a temporary file owned by the db named "dbname". // The result will be prefixed with "dbname". -extern std::string TempFileName(const std::string& dbname, uint64_t number); +std::string TempFileName(const std::string& dbname, uint64_t number); // A helper structure for prefix of info log names. struct InfoLogPrefix { @@ -115,74 +114,73 @@ struct InfoLogPrefix { }; // Return the name of the info log file for "dbname". -extern std::string InfoLogFileName(const std::string& dbname, - const std::string& db_path = "", - const std::string& log_dir = ""); +std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path = "", + const std::string& log_dir = ""); // Return the name of the old info log file for "dbname". -extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, - const std::string& db_path = "", - const std::string& log_dir = ""); +std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path = "", + const std::string& log_dir = ""); extern const std::string kOptionsFileNamePrefix; // = "OPTIONS-" extern const std::string kTempFileNameSuffix; // = "dbtmp" // Return a options file name given the "dbname" and file number. // Format: OPTIONS-[number].dbtmp -extern std::string OptionsFileName(const std::string& dbname, - uint64_t file_num); -extern std::string OptionsFileName(uint64_t file_num); +std::string OptionsFileName(const std::string& dbname, uint64_t file_num); +std::string OptionsFileName(uint64_t file_num); // Return a temp options file name given the "dbname" and file number. // Format: OPTIONS-[number] -extern std::string TempOptionsFileName(const std::string& dbname, - uint64_t file_num); +std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num); // Return the name to use for a metadatabase. The result will be prefixed with // "dbname". -extern std::string MetaDatabaseName(const std::string& dbname, uint64_t number); +std::string MetaDatabaseName(const std::string& dbname, uint64_t number); // Return the name of the Identity file which stores a unique number for the db // that will get regenerated if the db loses all its data and is recreated fresh // either from a backup-image or empty -extern std::string IdentityFileName(const std::string& dbname); +std::string IdentityFileName(const std::string& dbname); // If filename is a rocksdb file, store the type of the file in *type. // The number encoded in the filename is stored in *number. If the // filename was successfully parsed, returns true. Else return false. // info_log_name_prefix is the path of info logs. -extern bool ParseFileName(const std::string& filename, uint64_t* number, - const Slice& info_log_name_prefix, FileType* type, - WalFileType* log_type = nullptr); +bool ParseFileName(const std::string& filename, uint64_t* number, + const Slice& info_log_name_prefix, FileType* type, + WalFileType* log_type = nullptr); // Same as previous function, but skip info log files. -extern bool ParseFileName(const std::string& filename, uint64_t* number, - FileType* type, WalFileType* log_type = nullptr); +bool ParseFileName(const std::string& filename, uint64_t* number, + FileType* type, WalFileType* log_type = nullptr); // Make the CURRENT file point to the descriptor file with the // specified number. On its success and when dir_contains_current_file is not // nullptr, the function will fsync the directory containing the CURRENT file // when -extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, - uint64_t descriptor_number, - FSDirectory* dir_contains_current_file); +IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs, + const std::string& dbname, uint64_t descriptor_number, + FSDirectory* dir_contains_current_file); // Make the IDENTITY file for the db -extern Status SetIdentityFile(Env* env, const std::string& dbname, - const std::string& db_id = {}); +Status SetIdentityFile(const WriteOptions& write_options, Env* env, + const std::string& dbname, + const std::string& db_id = {}); // Sync manifest file `file`. -extern IOStatus SyncManifest(const ImmutableDBOptions* db_options, - WritableFileWriter* file); +IOStatus SyncManifest(const ImmutableDBOptions* db_options, + const WriteOptions& write_options, + WritableFileWriter* file); // Return list of file names of info logs in `file_names`. // The list only contains file name. The parent directory name is stored // in `parent_dir`. // `db_log_dir` should be the one as in options.db_log_dir -extern Status GetInfoLogFiles(const std::shared_ptr& fs, - const std::string& db_log_dir, - const std::string& dbname, - std::string* parent_dir, - std::vector* file_names); +Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, + std::string* parent_dir, + std::vector* file_names); -extern std::string NormalizePath(const std::string& path); +std::string NormalizePath(const std::string& path); } // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 84932440a03..4087c9cc58d 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -1325,6 +1325,10 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { ropts.readahead_size = cmp_ro.readahead_size = 32768; } + if (std::get<1>(GetParam())) { + ropts.async_io = true; + } + // With and without tuning readahead_size. { ASSERT_OK(options.statistics->Reset()); @@ -1351,10 +1355,6 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { cmp_iter->Next(); } - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - ASSERT_OK(cmp_iter->status()); ASSERT_OK(iter->status()); } @@ -1381,10 +1381,6 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { cmp_iter->Next(); } - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - ASSERT_OK(cmp_iter->status()); ASSERT_OK(iter->status()); } @@ -1554,7 +1550,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { SyncPoint::GetInstance()->SetCallBack( "BlockPrefetcher::SetReadaheadState", [&](void* arg) { readahead_carry_over_count++; - size_t readahead_size = *reinterpret_cast(arg); + size_t readahead_size = *static_cast(arg); if (readahead_carry_over_count) { ASSERT_GT(readahead_size, 8 * 1024); } @@ -1562,7 +1558,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { - current_readahead_size = *reinterpret_cast(arg); + current_readahead_size = *static_cast(arg); ASSERT_GT(current_readahead_size, 0); }); @@ -1641,7 +1637,6 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { ASSERT_OK(Flush()); } MoveFilesToLevel(2); - int buff_async_prefetch_count = 0; int buff_prefetch_count = 0; int readahead_carry_over_count = 0; int num_sst_files = NumTableFilesAtLevel(2); @@ -1654,10 +1649,6 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { "FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_async_prefetch_count++; }); - SyncPoint::GetInstance()->SetCallBack( "UpdateResults::io_uring_result", [&](void* /*arg*/) { read_async_called = true; }); @@ -1668,7 +1659,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { SyncPoint::GetInstance()->SetCallBack( "BlockPrefetcher::SetReadaheadState", [&](void* arg) { readahead_carry_over_count++; - size_t readahead_size = *reinterpret_cast(arg); + size_t readahead_size = *static_cast(arg); if (readahead_carry_over_count) { ASSERT_GT(readahead_size, 8 * 1024); } @@ -1676,7 +1667,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { - current_readahead_size = *reinterpret_cast(arg); + current_readahead_size = *static_cast(arg); ASSERT_GT(current_readahead_size, 0); }); @@ -1713,7 +1704,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. if (read_async_called) { - ASSERT_GT(buff_async_prefetch_count, 0); + ASSERT_GT(buff_prefetch_count, 0); ASSERT_GT(async_read_bytes.count, 0); } else { ASSERT_GT(buff_prefetch_count, 0); @@ -1957,7 +1948,7 @@ TEST_P(PrefetchTest1, SeekWithExtraPrefetchAsyncIO) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); Close(); - + int buff_prefetch_count = 0, extra_prefetch_buff_cnt = 0; for (size_t i = 0; i < 3; i++) { table_options.num_file_reads_for_auto_readahead = i; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1965,14 +1956,14 @@ TEST_P(PrefetchTest1, SeekWithExtraPrefetchAsyncIO) { s = TryReopen(options); ASSERT_OK(s); - int buff_prefetch_count = 0; - int extra_prefetch_buff_cnt = 0; + buff_prefetch_count = 0; + extra_prefetch_buff_cnt = 0; SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching", [&](void*) { extra_prefetch_buff_cnt++; }); SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", + "FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2066,7 +2057,7 @@ TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) { [&](void* /*arg*/) { set_readahead++; }); SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::TryReadFromCache", - [&](void* arg) { readahead_size = *reinterpret_cast(arg); }); + [&](void* arg) { readahead_size = *static_cast(arg); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2161,9 +2152,8 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) { SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { - current_readahead_size = *reinterpret_cast(arg); - }); + "FilePrefetchBuffer::TryReadFromCache", + [&](void* arg) { current_readahead_size = *static_cast(arg); }); SyncPoint::GetInstance()->EnableProcessing(); ReadOptions ro; @@ -2291,12 +2281,6 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); int buff_prefetch_count = 0; - int buff_prefetch_async_count = 0; - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_async_count++; }); - SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); @@ -2343,7 +2327,7 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) { // not all platforms support io_uring. In that case it'll fallback to // normal prefetching without async_io. if (read_async_called) { - ASSERT_EQ(buff_prefetch_async_count, 2); + ASSERT_EQ(buff_prefetch_count, 2); ASSERT_GT(async_read_bytes.count, 0); ASSERT_GT(get_perf_context()->number_async_seek, 0); } else { @@ -2353,314 +2337,6 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) { Close(); } -// This test checks if readahead_size is trimmed when upper_bound is reached. -// It tests with different combinations of async_io disabled/enabled, -// readahead_size (implicit and explicit), and num_file_reads_for_auto_readahead -// from 0 to 2. -TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBound) { - if (mem_env_ || encrypted_env_) { - ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); - return; - } - - // First param is if the mockFS support_prefetch or not - std::shared_ptr fs = - std::make_shared(FileSystem::Default(), false); - - std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - Options options; - SetGenericOptions(env.get(), /*use_direct_io=*/false, options); - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions table_options; - SetBlockBasedTableOptions(table_options); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - Status s = TryReopen(options); - ASSERT_OK(s); - - Random rnd(309); - WriteBatch batch; - - for (int i = 0; i < 26; i++) { - std::string key = "my_key_"; - - for (int j = 0; j < 10; j++) { - key += char('a' + i); - ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); - } - } - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - - std::string start_key = "my_key_a"; - - std::string end_key = "my_key_"; - for (int j = 0; j < 10; j++) { - end_key += char('a' + 25); - } - - Slice least(start_key.data(), start_key.size()); - Slice greatest(end_key.data(), end_key.size()); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); - - int buff_prefetch_count = 0; - - // Try with different num_file_reads_for_auto_readahead from 0 to 3. - for (size_t i = 0; i < 3; i++) { - table_options.num_file_reads_for_auto_readahead = i; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - s = TryReopen(options); - ASSERT_OK(s); - - int buff_count_with_tuning = 0, buff_count_without_tuning = 0; - int keys_with_tuning = 0, keys_without_tuning = 0; - int reseek_keys_with_tuning = 0, reseek_keys_without_tuning = 0; - buff_prefetch_count = 0; - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::Prefetch:Start", - [&](void*) { buff_prefetch_count++; }); - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); - - SyncPoint::GetInstance()->EnableProcessing(); - - ReadOptions ropts; - if (std::get<0>(GetParam())) { - ropts.readahead_size = 32768; - } - if (std::get<1>(GetParam())) { - ropts.async_io = true; - } - - // With tuning readahead_size. - { - ASSERT_OK(options.statistics->Reset()); - Slice ub = Slice("my_key_uuu"); - Slice* ub_ptr = &ub; - ropts.iterate_upper_bound = ub_ptr; - ropts.auto_readahead_size = true; - - auto iter = std::unique_ptr(db_->NewIterator(ropts)); - - // Seek. - { - Slice seek_key = Slice("my_key_aaa"); - iter->Seek(seek_key); - - while (iter->Valid()) { - keys_with_tuning++; - iter->Next(); - } - - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - buff_count_with_tuning = buff_prefetch_count; - } - - // Reseek with new upper_bound_iterator. - { - ub = Slice("my_key_y"); - Slice reseek_key = Slice("my_key_v"); - iter->Seek(reseek_key); - - while (iter->Valid()) { - iter->Next(); - reseek_keys_with_tuning++; - } - ASSERT_OK(iter->status()); - - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - ASSERT_GT(reseek_keys_with_tuning, 0); - } - } - - // Without tuning readahead_size - { - Slice ub = Slice("my_key_uuu"); - Slice* ub_ptr = &ub; - ropts.iterate_upper_bound = ub_ptr; - buff_prefetch_count = 0; - ASSERT_OK(options.statistics->Reset()); - ropts.auto_readahead_size = false; - - auto iter = std::unique_ptr(db_->NewIterator(ropts)); - - // Seek. - { - Slice seek_key = Slice("my_key_aaa"); - iter->Seek(seek_key); - - while (iter->Valid()) { - keys_without_tuning++; - iter->Next(); - } - buff_count_without_tuning = buff_prefetch_count; - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_EQ(readahead_trimmed, 0); - } - - // Reseek with new upper_bound_iterator. - { - ub = Slice("my_key_y"); - Slice reseek_key = Slice("my_key_v"); - iter->Seek(reseek_key); - - while (iter->Valid()) { - iter->Next(); - reseek_keys_without_tuning++; - } - ASSERT_OK(iter->status()); - - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_EQ(readahead_trimmed, 0); - ASSERT_GT(reseek_keys_without_tuning, 0); - } - } - - { - // Verify results with and without tuning. - if (std::get<1>(GetParam())) { - // In case of async_io. - ASSERT_GE(buff_count_with_tuning, buff_count_without_tuning); - } else { - ASSERT_EQ(buff_count_without_tuning, buff_count_with_tuning); - } - // Prefetching should happen. - ASSERT_GT(buff_count_without_tuning, 0); - ASSERT_GT(buff_count_with_tuning, 0); - // No of keys should be equal. - ASSERT_EQ(keys_without_tuning, keys_with_tuning); - // No of keys after reseek with new upper bound should be equal. - ASSERT_EQ(reseek_keys_without_tuning, reseek_keys_with_tuning); - } - Close(); - } -} - -// This test checks if readahead_size is trimmed when upper_bound is reached -// during Seek in async_io and it goes for polling without any extra -// prefetching. -TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBoundSeekOnly) { - if (mem_env_ || encrypted_env_) { - ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); - return; - } - - // First param is if the mockFS support_prefetch or not - std::shared_ptr fs = - std::make_shared(FileSystem::Default(), false); - - bool use_direct_io = false; - if (std::get<0>(GetParam())) { - use_direct_io = true; - } - - std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - Options options; - SetGenericOptions(env.get(), use_direct_io, options); - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions table_options; - SetBlockBasedTableOptions(table_options); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - Status s = TryReopen(options); - if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { - // If direct IO is not supported, skip the test - return; - } else { - ASSERT_OK(s); - } - - Random rnd(309); - WriteBatch batch; - - for (int i = 0; i < 26; i++) { - std::string key = "my_key_"; - - for (int j = 0; j < 10; j++) { - key += char('a' + i); - ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); - } - } - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - - std::string start_key = "my_key_a"; - - std::string end_key = "my_key_"; - for (int j = 0; j < 10; j++) { - end_key += char('a' + 25); - } - - Slice least(start_key.data(), start_key.size()); - Slice greatest(end_key.data(), end_key.size()); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); - - s = TryReopen(options); - ASSERT_OK(s); - - int buff_count_with_tuning = 0; - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_count_with_tuning++; }); - - bool read_async_called = false; - SyncPoint::GetInstance()->SetCallBack( - "UpdateResults::io_uring_result", - [&](void* /*arg*/) { read_async_called = true; }); - - SyncPoint::GetInstance()->EnableProcessing(); - - SyncPoint::GetInstance()->EnableProcessing(); - - ReadOptions ropts; - if (std::get<1>(GetParam())) { - ropts.readahead_size = 32768; - } - ropts.async_io = true; - - Slice ub = Slice("my_key_aaa"); - ropts.iterate_upper_bound = &ub; - Slice seek_key = Slice("my_key_aaa"); - - // With tuning readahead_size. - { - ASSERT_OK(options.statistics->Reset()); - ropts.auto_readahead_size = true; - - auto iter = std::unique_ptr(db_->NewIterator(ropts)); - - iter->Seek(seek_key); - - ASSERT_OK(iter->status()); - - // Verify results. - uint64_t readhahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - // Readahead got trimmed. - if (read_async_called) { - ASSERT_GT(readhahead_trimmed, 0); - // Seek called PrefetchAsync to poll the data. - ASSERT_EQ(1, buff_count_with_tuning); - } else { - // async_io disabled. - ASSERT_GE(readhahead_trimmed, 0); - ASSERT_EQ(0, buff_count_with_tuning); - } - } - Close(); -} - namespace { #ifdef GFLAGS const int kMaxArgCount = 100; @@ -2743,9 +2419,8 @@ TEST_P(PrefetchTest, ReadAsyncWithPosixFS) { ro.readahead_size = 16 * 1024; } - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); SyncPoint::GetInstance()->SetCallBack( "UpdateResults::io_uring_result", @@ -2767,7 +2442,6 @@ TEST_P(PrefetchTest, ReadAsyncWithPosixFS) { if (read_async_called) { ASSERT_EQ(num_keys, total_keys); - ASSERT_GT(buff_prefetch_count, 0); // Check stats to make sure async prefetch is done. HistogramData async_read_bytes; options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); @@ -2781,8 +2455,8 @@ TEST_P(PrefetchTest, ReadAsyncWithPosixFS) { // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. ASSERT_EQ(num_keys, total_keys); - ASSERT_EQ(buff_prefetch_count, 0); } + ASSERT_GT(buff_prefetch_count, 0); } SyncPoint::GetInstance()->DisableProcessing(); @@ -2870,9 +2544,8 @@ TEST_P(PrefetchTest, MultipleSeekWithPosixFS) { ro.readahead_size = 16 * 1024; } - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); SyncPoint::GetInstance()->SetCallBack( "UpdateResults::io_uring_result", @@ -2998,9 +2671,8 @@ TEST_P(PrefetchTest, SeekParallelizationTestWithPosix) { int buff_prefetch_count = 0; - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); bool read_async_called = false; SyncPoint::GetInstance()->SetCallBack( @@ -3120,9 +2792,8 @@ TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) { ro.readahead_size = 16 * 1024; } - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); SyncPoint::GetInstance()->SetCallBack( "UpdateResults::io_uring_result", @@ -3239,7 +2910,11 @@ TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) { std::unique_ptr r; Read(fname, opts, &r); - FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, 0, fs()); + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 16384; + readahead_params.max_readahead_size = 16384; + + FilePrefetchBuffer fpb(readahead_params, true, false, fs()); Slice result; // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings, // it will do two reads of 4096+8192 and 8192 @@ -3257,14 +2932,13 @@ TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) { // 16384 IOOptions io_opts; io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; - ASSERT_TRUE( - fpb.TryReadFromCacheAsync(io_opts, r.get(), 8192, 8192, &result, &s)); + ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), 8192, 8192, &result, &s)); } // Test to ensure when PrefetchAsync is called during seek, it doesn't do any // alignment or prefetch extra if readahead is not enabled during seek. TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { - std::string fname = "seek-wwithout-alignment"; + std::string fname = "seek-without-alignment"; Random rand(0); std::string content = rand.RandomString(32768); Write(fname, content); @@ -3285,11 +2959,16 @@ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { // Without readahead enabled, there will be no alignment and offset of buffer // will be n. { - FilePrefetchBuffer fpb( - /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, - /*track_min_offset=*/false, /*implicit_auto_readahead=*/true, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, - /*upper_bound_offset=*/0, fs()); + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 8192; + readahead_params.max_readahead_size = 16384; + readahead_params.implicit_auto_readahead = true; + readahead_params.num_file_reads_for_auto_readahead = 2; + readahead_params.num_buffers = 2; + + FilePrefetchBuffer fpb(readahead_params, /*enable=*/true, + /*track_min_offset=*/false, fs(), nullptr, nullptr, + nullptr, FilePrefetchBufferUsage::kUnknown); Slice result; // Simulate a seek of half of alignment bytes at offset n. Due to the @@ -3306,7 +2985,7 @@ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { IOOptions io_opts; io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; - ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), n, n, &result, &s)); + ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), n, n, &result, &s)); if (read_async_called) { ASSERT_EQ(fpb.GetPrefetchOffset(), n); @@ -3317,11 +2996,14 @@ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { // buffer will be 0. { read_async_called = false; - FilePrefetchBuffer fpb( - /*readahead_size=*/16384, /*max_readahead_size=*/16384, /*enable=*/true, - /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, - /*upper_bound_offset=*/0, fs()); + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 16384; + readahead_params.max_readahead_size = 16384; + readahead_params.num_file_reads_for_auto_readahead = 2; + readahead_params.num_buffers = 2; + FilePrefetchBuffer fpb(readahead_params, /*enable=*/true, + /*track_min_offset=*/false, fs(), nullptr, nullptr, + nullptr, FilePrefetchBufferUsage::kUnknown); Slice result; // Simulate a seek of half of alignment bytes at offset n. @@ -3336,7 +3018,7 @@ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { IOOptions io_opts; io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; - ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), n, n, &result, &s)); + ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), n, n, &result, &s)); if (read_async_called) { ASSERT_EQ(fpb.GetPrefetchOffset(), 0); @@ -3354,11 +3036,13 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { std::unique_ptr r; Read(fname, opts, &r); - FilePrefetchBuffer fpb( - /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, - /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, - /*upper_bound_offset=*/0, fs()); + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 8192; + readahead_params.max_readahead_size = 16384; + readahead_params.num_buffers = 2; + FilePrefetchBuffer fpb(readahead_params, /*enable=*/true, + /*track_min_offset=*/false, fs(), nullptr, nullptr, + nullptr, FilePrefetchBufferUsage::kUnknown); int read_async_called = 0; SyncPoint::GetInstance()->SetCallBack( @@ -3379,8 +3063,8 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { ASSERT_TRUE(s.IsTryAgain()); IOOptions io_opts; io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; - ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), /*offset=*/3000, - /*length=*/4000, &async_result, &s)); + ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), /*offset=*/3000, + /*length=*/4000, &async_result, &s)); // No sync call should be made. HistogramData sst_read_micros; stats()->histogramData(SST_READ_MICROS, &sst_read_micros); @@ -3396,63 +3080,6 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { ASSERT_EQ(result, async_result); } -// This test checks if during seek in async_io, if first buffer already -// prefetched the data till upper_bound offset, second buffer shouldn't go for -// prefetching. -TEST_F(FilePrefetchBufferTest, IterateUpperBoundTest1) { - std::string fname = "iterate-upperbound-test1"; - Random rand(0); - std::string content = rand.RandomString(32768); - Write(fname, content); - - FileOptions opts; - std::unique_ptr r; - Read(fname, opts, &r); - - FilePrefetchBuffer fpb( - /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, - /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, - /*upper_bound_offset=*/8000, fs()); - - int read_async_called = 0; - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::ReadAsync", - [&](void* /*arg*/) { read_async_called++; }); - SyncPoint::GetInstance()->EnableProcessing(); - - Slice async_result; - // Simulate a seek of 4000 bytes at offset 3000. Due to the readahead - // settings, it will do 1 read of 4000+1000 (till 8000 - upper bound). - Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 3000, 4000, &async_result); - - // Platforms that don't have IO uring may not support async IO - if (s.IsNotSupported()) { - return; - } - - ASSERT_TRUE(s.IsTryAgain()); - IOOptions io_opts; - io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; - ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), /*offset=*/3000, - /*length=*/4000, &async_result, &s)); - // No sync call should be made. - HistogramData sst_read_micros; - stats()->histogramData(SST_READ_MICROS, &sst_read_micros); - ASSERT_EQ(sst_read_micros.count, 0); - - // Number of async calls should be 1. - // No Prefetching should happen in second buffer as first buffer has already - // prefetched till offset. - ASSERT_EQ(read_async_called, 1); - // Length should be 4000. - ASSERT_EQ(async_result.size(), 4000); - // Data correctness. - Slice result(&content[3000], 4000); - ASSERT_EQ(result.size(), 4000); - ASSERT_EQ(result, async_result); -} - TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { std::string fname = "seek-with-block-cache-hit"; Random rand(0); @@ -3464,11 +3091,14 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { Read(fname, opts, &r); std::shared_ptr stats = CreateDBStatistics(); - FilePrefetchBuffer fpb(8192, 8192, true, false, false, 0, 0, 0, fs(), nullptr, + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 8192; + readahead_params.max_readahead_size = 8192; + FilePrefetchBuffer fpb(readahead_params, true, false, fs(), nullptr, stats.get()); Slice result; // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings, - // it will do two reads of 4096+8192 and 8192 + // it will do a read of offset 0 and length - (4096 + 8192) 12288. Status s; ASSERT_TRUE(fpb.TryReadFromCache(IOOptions(), r.get(), 0, 4096, &result, &s)); ASSERT_EQ(s, Status::OK()); @@ -3477,8 +3107,8 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { // Simulate a block cache hit fpb.UpdateReadPattern(4096, 4096, false); - // Now read some data that straddles the two prefetch buffers - offset 8192 to - // 16384 + // Now read some data that'll prefetch additional data from 12288 to 24576. + // (8192) + 8192 (readahead_size). ASSERT_TRUE( fpb.TryReadFromCache(IOOptions(), r.get(), 8192, 8192, &result, &s)); ASSERT_EQ(s, Status::OK()); @@ -3488,8 +3118,19 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { ASSERT_TRUE( fpb.TryReadFromCache(IOOptions(), r.get(), 12288, 4096, &result, &s)); ASSERT_EQ(s, Status::OK()); - ASSERT_EQ(stats->getTickerCount(PREFETCH_HITS), 1); - ASSERT_EQ(stats->getTickerCount(PREFETCH_BYTES_USEFUL), 8192); + ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 1); + ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL), 8192); + + // Now read some data with length doesn't align with aligment and it needs + // prefetching. Read from 16000 with length 10000 (i.e. requested end offset - + // 26000). + ASSERT_TRUE( + fpb.TryReadFromCache(IOOptions(), r.get(), 16000, 10000, &result, &s)); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 0); + ASSERT_EQ( + stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL), + /* 24576(end offset of the buffer) - 16000(requested offset) =*/8576); } } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 8c07b8333ef..92381e79015 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -422,7 +422,8 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, remaining_bytes -= request_bytes; } } - io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); + io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, + /*IODebugContext*=*/nullptr); RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs); } @@ -463,7 +464,6 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, file_name(), read_reqs[i].result.size(), read_reqs[i].offset); } - RecordIOStats(stats_, file_temperature_, is_last_level_, read_reqs[i].result.size()); } @@ -487,15 +487,16 @@ IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, IOStatus RandomAccessFileReader::ReadAsync( FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) { IOStatus s; // Create a callback and populate info. auto read_async_callback = std::bind(&RandomAccessFileReader::ReadAsyncCallback, this, std::placeholders::_1, std::placeholders::_2); - ReadAsyncInfo* read_async_info = - new ReadAsyncInfo(cb, cb_arg, clock_->NowMicros()); + + ReadAsyncInfo* read_async_info = new ReadAsyncInfo( + cb, cb_arg, (clock_ != nullptr ? clock_->NowMicros() : 0)); if (ShouldNotifyListeners()) { read_async_info->fs_start_ts_ = FileOperationInfo::StartNow(); @@ -557,7 +558,7 @@ IOStatus RandomAccessFileReader::ReadAsync( return s; } -void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req, +void RandomAccessFileReader::ReadAsyncCallback(FSReadRequest& req, void* cb_arg) { ReadAsyncInfo* read_async_info = static_cast(cb_arg); assert(read_async_info); diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index b52bfead164..f1efa8abf61 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -92,8 +92,8 @@ class RandomAccessFileReader { const bool is_last_level_; struct ReadAsyncInfo { - ReadAsyncInfo(std::function cb, - void* cb_arg, uint64_t start_time) + ReadAsyncInfo(std::function cb, void* cb_arg, + uint64_t start_time) : cb_(cb), cb_arg_(cb_arg), start_time_(start_time), @@ -103,7 +103,7 @@ class RandomAccessFileReader { user_len_(0), is_aligned_(false) {} - std::function cb_; + std::function cb_; void* cb_arg_; uint64_t start_time_; FileOperationInfo::StartTimePoint fs_start_ts_; @@ -189,11 +189,9 @@ class RandomAccessFileReader { IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf); - - void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg); // RocksDB-Cloud contribution begin IOStatus MultiReadAsync( @@ -205,5 +203,7 @@ class RandomAccessFileReader { // Callback for non-directIO MultiReadAsync. void MultiReadAsyncCallback(const FSReadRequest*, size_t, void*); // RocksDB-Cloud contribution end + + void ReadAsyncCallback(FSReadRequest& req, void* cb_arg); }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc index 6b7b7eb68ce..f081795b9d1 100644 --- a/file/random_access_file_reader_test.cc +++ b/file/random_access_file_reader_test.cc @@ -425,7 +425,9 @@ TEST(FSReadRequest, TryMerge) { src.scratch = nullptr; ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); @@ -448,7 +450,9 @@ TEST(FSReadRequest, TryMerge) { src.scratch = nullptr; ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); @@ -471,7 +475,9 @@ TEST(FSReadRequest, TryMerge) { src.scratch = nullptr; ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); diff --git a/file/read_write_util.h b/file/read_write_util.h index 9f034b705f1..edcddd55570 100644 --- a/file/read_write_util.h +++ b/file/read_write_util.h @@ -21,9 +21,9 @@ namespace ROCKSDB_NAMESPACE { // fname : the file name. // result : output arg. A WritableFile based on `fname` returned. // options : the Env Options. -extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, - std::unique_ptr* result, - const FileOptions& options); +IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, + std::unique_ptr* result, + const FileOptions& options); #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size); diff --git a/file/sequence_file_reader.cc b/file/sequence_file_reader.cc index a753c1d098c..ac2f37b0d0d 100644 --- a/file/sequence_file_reader.cc +++ b/file/sequence_file_reader.cc @@ -16,6 +16,7 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" #include "util/aligned_buffer.h" #include "util/random.h" @@ -38,6 +39,8 @@ IOStatus SequentialFileReader::Create( IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, Env::IOPriority rate_limiter_priority) { IOStatus io_s; + IOOptions io_opts; + io_opts.rate_limiter_priority = rate_limiter_priority; if (use_direct_io()) { // // |-offset_advance-|---bytes returned--| @@ -76,7 +79,7 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, start_ts = FileOperationInfo::StartNow(); } io_s = file_->PositionedRead(aligned_offset + buf.CurrentSize(), allowed, - IOOptions(), &tmp, buf.Destination(), + io_opts, &tmp, buf.Destination(), nullptr /* dbg */); if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); @@ -119,7 +122,7 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, start_ts = FileOperationInfo::StartNow(); } Slice tmp; - io_s = file_->Read(allowed, IOOptions(), &tmp, scratch + read, + io_s = file_->Read(allowed, io_opts, &tmp, scratch + read, nullptr /* dbg */); if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index 24f056dcc4e..d3a40c8b639 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -88,16 +88,16 @@ class SstFileManagerImpl : public SstFileManager { std::unordered_map GetTrackedFiles() override; // Return delete rate limit in bytes per second. - virtual int64_t GetDeleteRateBytesPerSecond() override; + int64_t GetDeleteRateBytesPerSecond() override; // Update the delete rate limit in bytes per second. - virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override; + void SetDeleteRateBytesPerSecond(int64_t delete_rate) override; // Return trash/DB size ratio where new files will be deleted immediately - virtual double GetMaxTrashDBRatio() override; + double GetMaxTrashDBRatio() override; // Update trash/DB size ratio where new files will be deleted immediately - virtual void SetMaxTrashDBRatio(double ratio) override; + void SetMaxTrashDBRatio(double ratio) override; // Return the total size of trash files uint64_t GetTotalTrashSize() override; diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index 908878a5fae..4fadf1d71a3 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -13,6 +13,7 @@ #include #include "db/version_edit.h" +#include "file/file_util.h" #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" @@ -24,6 +25,24 @@ #include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { +inline Histograms GetFileWriteHistograms(Histograms file_writer_hist, + Env::IOActivity io_activity) { + if (file_writer_hist == Histograms::SST_WRITE_MICROS || + file_writer_hist == Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS) { + switch (io_activity) { + case Env::IOActivity::kFlush: + return Histograms::FILE_WRITE_FLUSH_MICROS; + case Env::IOActivity::kCompaction: + return Histograms::FILE_WRITE_COMPACTION_MICROS; + case Env::IOActivity::kDBOpen: + return Histograms::FILE_WRITE_DB_OPEN_MICROS; + default: + break; + } + } + return Histograms::HISTOGRAM_ENUM_MAX; +} + IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, const std::string& fname, const FileOptions& file_opts, @@ -42,12 +61,16 @@ IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, return io_s; } -IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data, + uint32_t crc32c_checksum) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + StopWatch sw(clock_, stats_, hist_type_, + GetFileWriteHistograms(hist_type_, opts.io_activity)); + + const IOOptions io_options = FinalizeIOOptions(opts); const char* src = data.data(); size_t left = data.size(); IOStatus s; @@ -59,10 +82,6 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, UpdateFileChecksum(data); { - IOOptions io_options; - io_options.rate_limiter_priority = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); IOSTATS_TIMER_GUARD(prepare_write_nanos); TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite"); writable_file_->PrepareWrite(static_cast(GetFileSize()), left, @@ -88,7 +107,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, // Flush only when buffered I/O if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) { if (buf_.CurrentSize() > 0) { - s = Flush(op_rate_limiter_priority); + s = Flush(io_options); if (!s.ok()) { set_seen_error(); return s; @@ -119,7 +138,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, src += appended; if (left > 0) { - s = Flush(op_rate_limiter_priority); + s = Flush(io_options); if (!s.ok()) { break; } @@ -129,7 +148,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, } else { assert(buf_.CurrentSize() == 0); buffered_data_crc32c_checksum_ = crc32c_checksum; - s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority); + s = WriteBufferedWithChecksum(io_options, src, left); } } else { // In this case, either we do not need to do the data verification or @@ -149,7 +168,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, src += appended; if (left > 0) { - s = Flush(op_rate_limiter_priority); + s = Flush(io_options); if (!s.ok()) { break; } @@ -160,9 +179,9 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, assert(buf_.CurrentSize() == 0); if (perform_data_verification_ && buffered_data_with_checksum_) { buffered_data_crc32c_checksum_ = crc32c::Value(src, left); - s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority); + s = WriteBufferedWithChecksum(io_options, src, left); } else { - s = WriteBuffered(src, left, op_rate_limiter_priority); + s = WriteBuffered(io_options, src, left); } } } @@ -177,11 +196,12 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, return s; } -IOStatus WritableFileWriter::Pad(const size_t pad_bytes, - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::Pad(const IOOptions& opts, + const size_t pad_bytes) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + const IOOptions io_options = FinalizeIOOptions(opts); assert(pad_bytes < kDefaultPageSize); size_t left = pad_bytes; size_t cap = buf_.Capacity() - buf_.CurrentSize(); @@ -195,7 +215,7 @@ IOStatus WritableFileWriter::Pad(const size_t pad_bytes, buf_.PadWith(append_bytes, 0); left -= append_bytes; if (left > 0) { - IOStatus s = Flush(op_rate_limiter_priority); + IOStatus s = Flush(io_options); if (!s.ok()) { set_seen_error(); return s; @@ -214,11 +234,12 @@ IOStatus WritableFileWriter::Pad(const size_t pad_bytes, return IOStatus::OK(); } -IOStatus WritableFileWriter::Close() { +IOStatus WritableFileWriter::Close(const IOOptions& opts) { + IOOptions io_options = FinalizeIOOptions(opts); if (seen_error()) { IOStatus interim; if (writable_file_.get() != nullptr) { - interim = writable_file_->Close(IOOptions(), nullptr); + interim = writable_file_->Close(io_options, nullptr); writable_file_.reset(); } if (interim.ok()) { @@ -240,11 +261,9 @@ IOStatus WritableFileWriter::Close() { } IOStatus s; - s = Flush(); // flush cache to OS + s = Flush(io_options); // flush cache to OS IOStatus interim; - IOOptions io_options; - io_options.rate_limiter_priority = writable_file_->GetIOPriority(); // In direct I/O mode we write whole pages so // we need to let the file know where data ends. if (use_direct_io()) { @@ -322,11 +341,13 @@ IOStatus WritableFileWriter::Close() { // write out the cached data to the OS cache or storage if direct I/O // enabled -IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::Flush(const IOOptions& opts) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + const IOOptions io_options = FinalizeIOOptions(opts); + IOStatus s; TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); @@ -334,18 +355,17 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { if (use_direct_io()) { if (pending_sync_) { if (perform_data_verification_ && buffered_data_with_checksum_) { - s = WriteDirectWithChecksum(op_rate_limiter_priority); + s = WriteDirectWithChecksum(io_options); } else { - s = WriteDirect(op_rate_limiter_priority); + s = WriteDirect(io_options); } } } else { if (perform_data_verification_ && buffered_data_with_checksum_) { - s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(), - op_rate_limiter_priority); + s = WriteBufferedWithChecksum(io_options, buf_.BufferStart(), + buf_.CurrentSize()); } else { - s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(), - op_rate_limiter_priority); + s = WriteBuffered(io_options, buf_.BufferStart(), buf_.CurrentSize()); } } if (!s.ok()) { @@ -359,10 +379,6 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } - IOOptions io_options; - io_options.rate_limiter_priority = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); s = writable_file_->Flush(io_options, nullptr); if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); @@ -400,7 +416,8 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { assert(offset_sync_to >= last_sync_size_); if (offset_sync_to > 0 && offset_sync_to - last_sync_size_ >= bytes_per_sync_) { - s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + s = RangeSync(io_options, last_sync_size_, + offset_sync_to - last_sync_size_); if (!s.ok()) { set_seen_error(); } @@ -429,19 +446,25 @@ const char* WritableFileWriter::GetFileChecksumFuncName() const { } } -IOStatus WritableFileWriter::Sync(bool use_fsync) { +IOStatus WritableFileWriter::PrepareIOOptions(const WriteOptions& wo, + IOOptions& opts) { + return PrepareIOFromWriteOptions(wo, opts); +} + +IOStatus WritableFileWriter::Sync(const IOOptions& opts, bool use_fsync) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } - IOStatus s = Flush(); + IOOptions io_options = FinalizeIOOptions(opts); + IOStatus s = Flush(io_options); if (!s.ok()) { set_seen_error(); return s; } TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); if (!use_direct_io() && pending_sync_) { - s = SyncInternal(use_fsync); + s = SyncInternal(io_options, use_fsync); if (!s.ok()) { set_seen_error(); return s; @@ -452,17 +475,19 @@ IOStatus WritableFileWriter::Sync(bool use_fsync) { return IOStatus::OK(); } -IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { +IOStatus WritableFileWriter::SyncWithoutFlush(const IOOptions& opts, + bool use_fsync) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } + IOOptions io_options = FinalizeIOOptions(opts); if (!writable_file_->IsSyncThreadSafe()) { return IOStatus::NotSupported( "Can't WritableFileWriter::SyncWithoutFlush() because " "WritableFile::IsSyncThreadSafe() is false"); } TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); - IOStatus s = SyncInternal(use_fsync); + IOStatus s = SyncInternal(io_options, use_fsync); TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); if (!s.ok()) { #ifndef NDEBUG @@ -473,7 +498,8 @@ IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { return s; } -IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { +IOStatus WritableFileWriter::SyncInternal(const IOOptions& opts, + bool use_fsync) { // Caller is supposed to check seen_error_ IOStatus s; IOSTATS_TIMER_GUARD(fsync_nanos); @@ -487,12 +513,10 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { start_ts = FileOperationInfo::StartNow(); } - IOOptions io_options; - io_options.rate_limiter_priority = writable_file_->GetIOPriority(); if (use_fsync) { - s = writable_file_->Fsync(io_options, nullptr); + s = writable_file_->Fsync(opts, nullptr); } else { - s = writable_file_->Sync(io_options, nullptr); + s = writable_file_->Sync(opts, nullptr); } if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); @@ -511,7 +535,8 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { return s; } -IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { +IOStatus WritableFileWriter::RangeSync(const IOOptions& opts, uint64_t offset, + uint64_t nbytes) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -522,9 +547,7 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } - IOOptions io_options; - io_options.rate_limiter_priority = writable_file_->GetIOPriority(); - IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr); + IOStatus s = writable_file_->RangeSync(offset, nbytes, opts, nullptr); if (!s.ok()) { set_seen_error(); } @@ -541,8 +564,8 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { // This method writes to disk the specified data and makes use of the rate // limiter if available -IOStatus WritableFileWriter::WriteBuffered( - const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteBuffered(const IOOptions& opts, + const char* data, size_t size) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -553,11 +576,7 @@ IOStatus WritableFileWriter::WriteBuffered( size_t left = size; DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; while (left > 0) { size_t allowed = left; @@ -573,7 +592,7 @@ IOStatus WritableFileWriter::WriteBuffered( TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); FileOperationInfo::StartTimePoint start_ts; - uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); + uint64_t old_size = writable_file_->GetFileSize(opts, nullptr); if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; @@ -585,10 +604,10 @@ IOStatus WritableFileWriter::WriteBuffered( if (perform_data_verification_) { Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); - s = writable_file_->Append(Slice(src, allowed), io_options, v_info, + s = writable_file_->Append(Slice(src, allowed), opts, v_info, nullptr); } else { - s = writable_file_->Append(Slice(src, allowed), io_options, nullptr); + s = writable_file_->Append(Slice(src, allowed), opts, nullptr); } if (!s.ok()) { // If writable_file_->Append() failed, then the data may or may not @@ -635,8 +654,9 @@ IOStatus WritableFileWriter::WriteBuffered( return s; } -IOStatus WritableFileWriter::WriteBufferedWithChecksum( - const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts, + const char* data, + size_t size) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -648,11 +668,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( size_t left = size; DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; // Check how much is allowed. Here, we loop until the rate limiter allows to // write the entire buffer. // TODO: need to be improved since it sort of defeats the purpose of the rate @@ -673,7 +689,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); FileOperationInfo::StartTimePoint start_ts; - uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); + uint64_t old_size = writable_file_->GetFileSize(opts, nullptr); if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; @@ -685,7 +701,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); - s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr); + s = writable_file_->Append(Slice(src, left), opts, v_info, nullptr); SetPerfLevel(prev_perf_level); } if (ShouldNotifyListeners()) { @@ -755,8 +771,7 @@ void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, // whole number of pages to be written again on the next flush because we can // only write on aligned // offsets. -IOStatus WritableFileWriter::WriteDirect( - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteDirect(const IOOptions& opts) { if (seen_error()) { assert(false); @@ -785,11 +800,7 @@ IOStatus WritableFileWriter::WriteDirect( size_t left = buf_.CurrentSize(); DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; while (left > 0) { // Check how much is allowed @@ -813,10 +824,10 @@ IOStatus WritableFileWriter::WriteDirect( Crc32cHandoffChecksumCalculation(src, size, checksum_buf); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - io_options, v_info, nullptr); + opts, v_info, nullptr); } else { s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - io_options, nullptr); + opts, nullptr); } if (ShouldNotifyListeners()) { @@ -859,8 +870,7 @@ IOStatus WritableFileWriter::WriteDirect( return s; } -IOStatus WritableFileWriter::WriteDirectWithChecksum( - Env::IOPriority op_rate_limiter_priority) { +IOStatus WritableFileWriter::WriteDirectWithChecksum(const IOOptions& opts) { if (seen_error()) { return AssertFalseAndGetStatusForPrevError(); } @@ -895,11 +905,7 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum( DataVerificationInfo v_info; char checksum_buf[sizeof(uint32_t)]; - Env::IOPriority rate_limiter_priority_used = - WritableFileWriter::DecideRateLimiterPriority( - writable_file_->GetIOPriority(), op_rate_limiter_priority); - IOOptions io_options; - io_options.rate_limiter_priority = rate_limiter_priority_used; + Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority; // Check how much is allowed. Here, we loop until the rate limiter allows to // write the entire buffer. // TODO: need to be improved since it sort of defeats the purpose of the rate @@ -925,8 +931,8 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum( // direct writes must be positional EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); - s = writable_file_->PositionedAppend(Slice(src, left), write_offset, - io_options, v_info, nullptr); + s = writable_file_->PositionedAppend(Slice(src, left), write_offset, opts, + v_info, nullptr); if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); @@ -986,4 +992,14 @@ Env::IOPriority WritableFileWriter::DecideRateLimiterPriority( } } +IOOptions WritableFileWriter::FinalizeIOOptions(const IOOptions& opts) const { + Env::IOPriority op_rate_limiter_priority = opts.rate_limiter_priority; + IOOptions io_options(opts); + if (writable_file_.get() != nullptr) { + io_options.rate_limiter_priority = + WritableFileWriter::DecideRateLimiterPriority( + writable_file_->GetIOPriority(), op_rate_limiter_priority); + } + return io_options; +} } // namespace ROCKSDB_NAMESPACE diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index aac0f59491e..6b71cfa64c6 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -13,6 +13,7 @@ #include "db/version_edit.h" #include "env/file_system_tracer.h" +#include "monitoring/thread_status_util.h" #include "port/port.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" @@ -159,6 +160,7 @@ class WritableFileWriter { uint64_t bytes_per_sync_; RateLimiter* rate_limiter_; Statistics* stats_; + Histograms hist_type_; std::vector> listeners_; std::unique_ptr checksum_generator_; bool checksum_finalized_; @@ -173,6 +175,7 @@ class WritableFileWriter { const FileOptions& options, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, + Histograms hist_type = Histograms::HISTOGRAM_ENUM_MAX, const std::vector>& listeners = {}, FileChecksumGenFactory* file_checksum_gen_factory = nullptr, bool perform_data_verification = false, @@ -191,6 +194,7 @@ class WritableFileWriter { bytes_per_sync_(options.bytes_per_sync), rate_limiter_(options.rate_limiter), stats_(stats), + hist_type_(hist_type), listeners_(), checksum_generator_(nullptr), checksum_finalized_(false), @@ -222,35 +226,42 @@ class WritableFileWriter { const std::string& fname, const FileOptions& file_opts, std::unique_ptr* writer, IODebugContext* dbg); + + static IOStatus PrepareIOOptions(const WriteOptions& wo, IOOptions& opts); + WritableFileWriter(const WritableFileWriter&) = delete; WritableFileWriter& operator=(const WritableFileWriter&) = delete; ~WritableFileWriter() { - auto s = Close(); + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_UNKNOWN); + auto s = Close(IOOptions()); s.PermitUncheckedError(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); } std::string file_name() const { return file_name_; } // When this Append API is called, if the crc32c_checksum is not provided, we // will calculate the checksum internally. - IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0, - Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + IOStatus Append(const IOOptions& opts, const Slice& data, + uint32_t crc32c_checksum = 0); - IOStatus Pad(const size_t pad_bytes, - Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + IOStatus Pad(const IOOptions& opts, const size_t pad_bytes); - IOStatus Flush(Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL); + IOStatus Flush(const IOOptions& opts); - IOStatus Close(); + IOStatus Close(const IOOptions& opts); - IOStatus Sync(bool use_fsync); + IOStatus Sync(const IOOptions& opts, bool use_fsync); // Sync only the data that was already Flush()ed. Safe to call concurrently // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), // returns NotSupported status. - IOStatus SyncWithoutFlush(bool use_fsync); + IOStatus SyncWithoutFlush(const IOOptions& opts, bool use_fsync); uint64_t GetFileSize() const { return filesize_.load(std::memory_order_acquire); @@ -307,14 +318,20 @@ class WritableFileWriter { // Used when os buffering is OFF and we are writing // DMA such as in Direct I/O mode - IOStatus WriteDirect(Env::IOPriority op_rate_limiter_priority); - IOStatus WriteDirectWithChecksum(Env::IOPriority op_rate_limiter_priority); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteDirect(const IOOptions& opts); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteDirectWithChecksum(const IOOptions& opts); // Normal write. - IOStatus WriteBuffered(const char* data, size_t size, - Env::IOPriority op_rate_limiter_priority); - IOStatus WriteBufferedWithChecksum(const char* data, size_t size, - Env::IOPriority op_rate_limiter_priority); - IOStatus RangeSync(uint64_t offset, uint64_t nbytes); - IOStatus SyncInternal(bool use_fsync); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteBuffered(const IOOptions& opts, const char* data, size_t size); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus WriteBufferedWithChecksum(const IOOptions& opts, const char* data, + size_t size); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus RangeSync(const IOOptions& opts, uint64_t offset, uint64_t nbytes); + // `opts` should've been called with `FinalizeIOOptions()` before passing in + IOStatus SyncInternal(const IOOptions& opts, bool use_fsync); + IOOptions FinalizeIOOptions(const IOOptions& opts) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index e5ffe8944d6..fdf2af058de 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -51,6 +51,8 @@ enum CompactionPri : char { // First compact files whose ratio between overlapping size in next level // and its size is the smallest. It in many cases can optimize write // amplification. + // Files marked for compaction will be prioritized over files that are not + // marked. kMinOverlappingRatio = 0x3, // Keeps a cursor(s) of the successor of the file (key range) was/were // compacted before, and always picks the next files (key range) in that @@ -59,151 +61,6 @@ enum CompactionPri : char { kRoundRobin = 0x4, }; -// Compression options for different compression algorithms like Zlib -struct CompressionOptions { - // ==> BEGIN options that can be set by deprecated configuration syntax, <== - // ==> e.g. compression_opts=5:6:7:8:9:10:true:11:false <== - // ==> Please use compression_opts={level=6;strategy=7;} form instead. <== - - // RocksDB's generic default compression level. Internally it'll be translated - // to the default compression level specific to the library being used (see - // comment above `ColumnFamilyOptions::compression`). - // - // The default value is the max 16-bit int as it'll be written out in OPTIONS - // file, which should be portable. - static constexpr int kDefaultCompressionLevel = 32767; - - // zlib only: windowBits parameter. See https://www.zlib.net/manual.html - int window_bits = -14; - - // Compression "level" applicable to zstd, zlib, LZ4, and LZ4HC. Except for - // kDefaultCompressionLevel (see above), the meaning of each value depends - // on the compression algorithm. Decreasing across non- - // `kDefaultCompressionLevel` values will either favor speed over - // compression ratio or have no effect. - // - // In LZ4 specifically, the absolute value of a negative `level` internally - // configures the `acceleration` parameter. For example, set `level=-10` for - // `acceleration=10`. This negation is necessary to ensure decreasing `level` - // values favor speed over compression ratio. - int level = kDefaultCompressionLevel; - - // zlib only: strategy parameter. See https://www.zlib.net/manual.html - int strategy = 0; - - // Maximum size of dictionaries used to prime the compression library. - // Enabling dictionary can improve compression ratios when there are - // repetitions across data blocks. - // - // The dictionary is created by sampling the SST file data. If - // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's - // dictionary generator (see comments for option `use_zstd_dict_trainer` for - // detail on dictionary generator). If `zstd_max_train_bytes` is zero, the - // random samples are used directly as the dictionary. - // - // When compression dictionary is disabled, we compress and write each block - // before buffering data for the next one. When compression dictionary is - // enabled, we buffer SST file data in-memory so we can sample it, as data - // can only be compressed and written after the dictionary has been finalized. - // - // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This - // buffered memory is charged to the block cache when there is a block cache. - // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is - // full), we finalize the dictionary with whatever data we have and then stop - // buffering. - uint32_t max_dict_bytes = 0; - - // Maximum size of training data passed to zstd's dictionary trainer. Using - // zstd's dictionary trainer can achieve even better compression ratio - // improvements than using `max_dict_bytes` alone. - // - // The training data will be used to generate a dictionary of max_dict_bytes. - uint32_t zstd_max_train_bytes = 0; - - // Number of threads for parallel compression. - // Parallel compression is enabled only if threads > 1. - // THE FEATURE IS STILL EXPERIMENTAL - // - // This option is valid only when BlockBasedTable is used. - // - // When parallel compression is enabled, SST size file sizes might be - // more inflated compared to the target size, because more data of unknown - // compressed size is in flight when compression is parallelized. To be - // reasonably accurate, this inflation is also estimated by using historical - // compression ratio and current bytes inflight. - uint32_t parallel_threads = 1; - - // When the compression options are set by the user, it will be set to "true". - // For bottommost_compression_opts, to enable it, user must set enabled=true. - // Otherwise, bottommost compression will use compression_opts as default - // compression options. - // - // For compression_opts, if compression_opts.enabled=false, it is still - // used as compression options for compression process. - bool enabled = false; - - // Limit on data buffering when gathering samples to build a dictionary. Zero - // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), - // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. - // - // In compaction, the buffering is limited to the target file size (see - // `target_file_size_base` and `target_file_size_multiplier`) even if this - // setting permits more buffering. Since we cannot determine where the file - // should be cut until data blocks are compressed with dictionary, buffering - // more than the target file size could lead to selecting samples that belong - // to a later output SST. - // - // Limiting too strictly may harm dictionary effectiveness since it forces - // RocksDB to pick samples from the initial portion of the output SST, which - // may not be representative of the whole file. Configuring this limit below - // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can - // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can - // restrict the size of the final dictionary. - uint64_t max_dict_buffer_bytes = 0; - - // Use zstd trainer to generate dictionaries. When this option is set to true, - // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes - // buffered data will be passed to zstd dictionary trainer to generate a - // dictionary of size max_dict_bytes. - // - // When this option is false, zstd's API ZDICT_finalizeDictionary() will be - // called to generate dictionaries. zstd_max_train_bytes of training sampled - // data will be passed to this API. Using this API should save CPU time on - // dictionary training, but the compression ratio may not be as good as using - // a dictionary trainer. - bool use_zstd_dict_trainer = true; - - // ===> END options that can be set by deprecated configuration syntax <=== - // ===> Use compression_opts={level=6;strategy=7;} form for below opts <=== - - // Essentially specifies a minimum acceptable compression ratio. A block is - // stored uncompressed if the compressed block does not achieve this ratio, - // because the downstream cost of decompression is not considered worth such - // a small savings (if any). - // However, the ratio is specified in a way that is efficient for checking. - // An integer from 1 to 1024 indicates the maximum allowable compressed bytes - // per 1KB of input, so the minimum acceptable ratio is 1024.0 / this value. - // For example, for a minimum ratio of 1.5:1, set to 683. See SetMinRatio(). - // Default: abandon use of compression for a specific block or entry if - // compressed by less than 12.5% (minimum ratio of 1.143:1). - int max_compressed_bytes_per_kb = 1024 * 7 / 8; - - // ZSTD only. - // Enable compression algorithm's checksum feature. - // (https://github.com/facebook/zstd/blob/d857369028d997c92ff1f1861a4d7f679a125464/lib/zstd.h#L428) - // Each compressed frame will have a 32-bit checksum attached. The checksum - // computed from the uncompressed data and can be verified during - // decompression. - bool checksum = false; - - // A convenience function for setting max_compressed_bytes_per_kb based on a - // minimum acceptable compression ratio (uncompressed size over compressed - // size). - void SetMinRatio(double min_ratio) { - max_compressed_bytes_per_kb = static_cast(1024.0 / min_ratio + 0.5); - } -}; - // Temperature of a file. Used to pass to FileSystem for a different // placement and/or coding. // Reserve some numbers in the middle, in case we need to insert new tier @@ -690,15 +547,6 @@ struct AdvancedColumnFamilyOptions { // Default: true bool level_compaction_dynamic_level_bytes = true; - // Allows RocksDB to generate files that are not exactly the target_file_size - // only for the non-bottommost files. Which can reduce the write-amplification - // from compaction. The file size could be from 0 to 2x target_file_size. - // Once enabled, non-bottommost compaction will try to cut the files align - // with the next level file boundaries (grandparent level). - // - // Default: true - bool level_compaction_dynamic_file_size = true; - // Default: 10. // // Dynamically changeable through SetOptions() API @@ -725,17 +573,6 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_compaction_bytes = 0; - // When setting up compaction input files, we ignore the - // `max_compaction_bytes` limit when pulling in input files that are entirely - // within output key range. - // - // Default: true - // - // Dynamically changeable through SetOptions() API - // We could remove this knob and always ignore the limit once it is proven - // safe. - bool ignore_max_compaction_bytes_for_input = true; - // All writes will be slowed down to at least delayed_write_rate if estimated // bytes needed to be compaction exceed this threshold. // @@ -812,18 +649,29 @@ struct AdvancedColumnFamilyOptions { TablePropertiesCollectorFactories table_properties_collector_factories; // Maximum number of successive merge operations on a key in the memtable. + // It may be violated when filesystem reads would be needed to stay under the + // limit, unless `strict_max_successive_merges` is explicitly set. // // When a merge operation is added to the memtable and the maximum number of - // successive merges is reached, the value of the key will be calculated and - // inserted into the memtable instead of the merge operation. This will - // ensure that there are never more than max_successive_merges merge - // operations in the memtable. + // successive merges is reached, RocksDB will attempt to read the value. Upon + // success, the value will be inserted into the memtable instead of the merge + // operation. // // Default: 0 (disabled) // // Dynamically changeable through SetOptions() API size_t max_successive_merges = 0; + // Whether to allow filesystem reads to stay under the `max_successive_merges` + // limit. When true, this can lead to merge writes blocking the write path + // waiting on filesystem reads. + // + // This option is temporary in case the recent change to disallow filesystem + // reads during merge writes has a problem and users need to undo it quickly. + // + // Default: false + bool strict_max_successive_merges = false; + // This flag specifies that the implementation should optimize the filters // mainly for cases where keys are found rather than also optimize for keys // missed. This would be used in cases where the application knows that @@ -840,14 +688,6 @@ struct AdvancedColumnFamilyOptions { // Default: false bool optimize_filters_for_hits = false; - // During flush or compaction, check whether keys inserted to output files - // are in order. - // - // Default: true - // - // Dynamically changeable through SetOptions() API - bool check_flush_compaction_key_order = true; - // After writing every SST file, reopen it and read all the keys. // Checks the hash of all of the keys and values written versus the // keys in the file and signals a corruption if they do not match @@ -912,7 +752,9 @@ struct AdvancedColumnFamilyOptions { // // Leveled: files older than `periodic_compaction_seconds` will be picked up // for compaction and will be re-written to the same level as they were - // before. + // before if level_compaction_dynamic_level_bytes is disabled. Otherwise, + // it will rewrite files to the next level except for the last level files + // to the same level. // // FIFO: not supported. Setting this option has no effect for FIFO compaction. // @@ -961,27 +803,28 @@ struct AdvancedColumnFamilyOptions { uint64_t sample_for_compression = 0; // EXPERIMENTAL - // The feature is still in development and is incomplete. // If this option is set, when creating the last level files, pass this // temperature to FileSystem used. Should be no-op for default FileSystem // and users need to plug in their own FileSystem to take advantage of it. - // - // Note: the feature is changed from `bottommost_temperature` to - // `last_level_temperature` which now only apply for the last level files. - // The option name `bottommost_temperature` is kept only for migration, the - // behavior is the same as `last_level_temperature`. Please stop using - // `bottommost_temperature` and will be removed in next release. + // When using FIFO compaction, this option is ignored. // // Dynamically changeable through the SetOptions() API - Temperature bottommost_temperature = Temperature::kUnknown; Temperature last_level_temperature = Temperature::kUnknown; + // EXPERIMENTAL + // When no other option such as last_level_temperature determines the + // temperature of a new SST file, it will be written with this temperature, + // which can be set differently for each column family. + // + // Dynamically changeable through the SetOptions() API + Temperature default_write_temperature = Temperature::kUnknown; + // EXPERIMENTAL // When this field is set, all SST files without an explicitly set temperature // will be treated as if they have this temperature for file reading // accounting purpose, such as io statistics, io perf context. // - // Not dynamically changeable, change it requires db restart. + // Not dynamically changeable; change requires DB restart. Temperature default_temperature = Temperature::kUnknown; // EXPERIMENTAL diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 8a26585fe73..9de8770965f 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -608,6 +608,9 @@ extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot( extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot( rocksdb_t* db, const rocksdb_snapshot_t* snapshot); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_snapshot_get_sequence_number(const rocksdb_snapshot_t* snapshot); + /* Returns NULL if property name is unknown. Else returns a pointer to a malloc()-ed null-terminated value. */ extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db, @@ -691,8 +694,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr); -extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions( - rocksdb_t* db, unsigned char force, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions(rocksdb_t* db, + char** errptr); /* Management operations */ @@ -1152,10 +1155,16 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); +extern ROCKSDB_LIBRARY_API rocksdb_logger_t* rocksdb_options_get_info_log( + rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level( rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API rocksdb_logger_t* +rocksdb_logger_create_stderr_logger(int log_level, const char* prefix); +extern ROCKSDB_LIBRARY_API void rocksdb_logger_destroy( + rocksdb_logger_t* logger); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API size_t @@ -1261,6 +1270,9 @@ rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t*, int* level_values, size_t num_levels); extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ttl(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_options_get_ttl(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_periodic_compaction_seconds( rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API uint64_t @@ -1496,10 +1508,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_advise_random_on_open(rocksdb_options_t*); -extern ROCKSDB_LIBRARY_API void -rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); -extern ROCKSDB_LIBRARY_API int -rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex( @@ -1681,6 +1689,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create_auto_tuned(int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* +rocksdb_ratelimiter_create_with_mode(int64_t rate_bytes_per_sec, + int64_t refill_period_us, int32_t fairness, + int mode, bool auto_tuned); extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy( rocksdb_ratelimiter_t*); diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index d3762b4a2e1..59805881f5e 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -295,6 +295,9 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { // The compression method (if any) that is used to compress data. CompressionType compression_type = CompressionType::kLZ4Compression; + // Options specific to the compression algorithm + CompressionOptions compression_opts; + // compress_format_version can have two values: // compress_format_version == 1 -- decompressed size is not included in the // block header. @@ -380,9 +383,6 @@ inline std::shared_ptr NewCompressedSecondaryCache( // to find the appropriate balance automatically. // * Cache priorities are less aggressively enforced, which could cause // cache dilution from long range scans (unless they use fill_cache=false). -// * Can be worse for small caches, because if almost all of a cache shard is -// pinned (more likely with non-partitioned filters), then CLOCK eviction -// becomes very CPU intensive. // // See internal cache/clock_cache.h for full description. struct HyperClockCacheOptions : public ShardedCacheOptions { @@ -441,6 +441,43 @@ struct HyperClockCacheOptions : public ShardedCacheOptions { // load factor for efficient Lookup, Insert, etc. size_t min_avg_entry_charge = 450; + // A tuning parameter to cap eviction CPU usage in a "thrashing" situation + // by allowing the memory capacity to be exceeded slightly as needed. The + // default setting should offer balanced protection against excessive CPU + // and memory usage under extreme stress conditions, with no effect on + // normal operation. Such stress conditions are proportionally more likely + // with small caches (10s of MB or less) vs. large caches (GB-scale). + // (NOTE: With the unusual setting of strict_capacity_limit=true, this + // parameter is ignored.) + // + // BACKGROUND: Without some kind of limiter, inserting into a CLOCK-based + // cache with no evictable entries (all "pinned") requires scanning the + // entire cache to determine that nothing can be evicted. (By contrast, + // LRU caches can determine no entries are evictable in O(1) time, but + // require more synchronization/coordination on that eviction metadata.) + // This aspect of a CLOCK cache can make a stressed situation worse by + // bogging down the CPU with repeated scans of the cache. And with + // strict_capacity_limit=false (normal setting), finding something evictable + // doesn't change the outcome of insertion: the entry is inserted anyway + // and the cache is allowed to exceed its target capacity if necessary. + // + // SOLUTION: Eviction is aborted upon seeing some number of pinned + // entries before evicting anything, or if the ratio of pinned to evicted + // is too high. This setting `eviction_effort_cap` essentially controls both + // that allowed initial number of pinned entries and the maximum allowed + // ratio. As the pinned size approaches the target cache capacity, roughly + // 1/eviction_effort_cap additional portion of the capacity might be kept + // in memory and evictable in order to keep CLOCK eviction reasonably + // performant. Under the default setting and high stress conditions, this + // memory overhead is around 3-5%. Under normal or even moderate stress + // conditions, the memory overhead is negligible to zero. + // + // A large value like 1000 offers some protection with essentially no + // memory overhead, while the minimum value of 1 could be useful for a + // small cache where roughly doubling in size under stress could be OK to + // keep operations very fast. + int eviction_effort_cap = 30; + HyperClockCacheOptions( size_t _capacity, size_t _estimated_entry_charge, int _num_shard_bits = -1, bool _strict_capacity_limit = false, @@ -460,7 +497,7 @@ struct HyperClockCacheOptions : public ShardedCacheOptions { // has been removed. The new HyperClockCache requires an additional // configuration parameter that is not provided by this API. This function // simply returns a new LRUCache for functional compatibility. -extern std::shared_ptr NewClockCache( +std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, CacheMetadataChargePolicy metadata_charge_policy = @@ -499,6 +536,10 @@ enum TieredAdmissionPolicy { // allocations costed to the block cache, will be distributed // proportionally across both the primary and secondary. struct TieredCacheOptions { + // This should point to an instance of either LRUCacheOptions or + // HyperClockCacheOptions, depending on the cache_type. In either + // case, the capacity and secondary_cache fields in those options + // should not be set. If set, they will be ignored by NewTieredCache. ShardedCacheOptions* cache_opts = nullptr; PrimaryCacheType cache_type = PrimaryCacheType::kCacheTypeLRU; TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyAuto; @@ -515,8 +556,7 @@ struct TieredCacheOptions { std::shared_ptr nvm_sec_cache; }; -extern std::shared_ptr NewTieredCache( - const TieredCacheOptions& cache_opts); +std::shared_ptr NewTieredCache(const TieredCacheOptions& cache_opts); // EXPERIMENTAL // Dynamically update some of the parameters of a TieredCache. The input @@ -527,7 +567,7 @@ extern std::shared_ptr NewTieredCache( // 2. Once the compressed secondary cache is disabled by setting the // compressed_secondary_ratio to 0.0, it cannot be dynamically re-enabled // again -extern Status UpdateTieredCache( +Status UpdateTieredCache( const std::shared_ptr& cache, int64_t total_capacity = -1, double compressed_secondary_ratio = std::numeric_limits::max(), TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyMax); diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 1784f2329ac..66f2f390e7d 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -368,7 +368,7 @@ class CompactionFilterFactory : public Customizable { const CompactionFilter::Context& context) = 0; // Returns a name that identifies this `CompactionFilter` factory. - virtual const char* Name() const override = 0; + const char* Name() const override = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h index bfeb00bdef0..1fe2fd3fc7c 100644 --- a/include/rocksdb/compression_type.h +++ b/include/rocksdb/compression_type.h @@ -6,6 +6,7 @@ #pragma once #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -37,4 +38,149 @@ enum CompressionType : unsigned char { kDisableCompressionOption = 0xff, }; +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + // ==> BEGIN options that can be set by deprecated configuration syntax, <== + // ==> e.g. compression_opts=5:6:7:8:9:10:true:11:false <== + // ==> Please use compression_opts={level=6;strategy=7;} form instead. <== + + // RocksDB's generic default compression level. Internally it'll be translated + // to the default compression level specific to the library being used (see + // comment above `ColumnFamilyOptions::compression`). + // + // The default value is the max 16-bit int as it'll be written out in OPTIONS + // file, which should be portable. + static constexpr int kDefaultCompressionLevel = 32767; + + // zlib only: windowBits parameter. See https://www.zlib.net/manual.html + int window_bits = -14; + + // Compression "level" applicable to zstd, zlib, LZ4, and LZ4HC. Except for + // kDefaultCompressionLevel (see above), the meaning of each value depends + // on the compression algorithm. Decreasing across non- + // `kDefaultCompressionLevel` values will either favor speed over + // compression ratio or have no effect. + // + // In LZ4 specifically, the absolute value of a negative `level` internally + // configures the `acceleration` parameter. For example, set `level=-10` for + // `acceleration=10`. This negation is necessary to ensure decreasing `level` + // values favor speed over compression ratio. + int level = kDefaultCompressionLevel; + + // zlib only: strategy parameter. See https://www.zlib.net/manual.html + int strategy = 0; + + // Maximum size of dictionaries used to prime the compression library. + // Enabling dictionary can improve compression ratios when there are + // repetitions across data blocks. + // + // The dictionary is created by sampling the SST file data. If + // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's + // dictionary generator (see comments for option `use_zstd_dict_trainer` for + // detail on dictionary generator). If `zstd_max_train_bytes` is zero, the + // random samples are used directly as the dictionary. + // + // When compression dictionary is disabled, we compress and write each block + // before buffering data for the next one. When compression dictionary is + // enabled, we buffer SST file data in-memory so we can sample it, as data + // can only be compressed and written after the dictionary has been finalized. + // + // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This + // buffered memory is charged to the block cache when there is a block cache. + // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is + // full), we finalize the dictionary with whatever data we have and then stop + // buffering. + uint32_t max_dict_bytes = 0; + + // Maximum size of training data passed to zstd's dictionary trainer. Using + // zstd's dictionary trainer can achieve even better compression ratio + // improvements than using `max_dict_bytes` alone. + // + // The training data will be used to generate a dictionary of max_dict_bytes. + uint32_t zstd_max_train_bytes = 0; + + // Number of threads for parallel compression. + // Parallel compression is enabled only if threads > 1. + // THE FEATURE IS STILL EXPERIMENTAL + // + // This option is valid only when BlockBasedTable is used. + // + // When parallel compression is enabled, SST size file sizes might be + // more inflated compared to the target size, because more data of unknown + // compressed size is in flight when compression is parallelized. To be + // reasonably accurate, this inflation is also estimated by using historical + // compression ratio and current bytes inflight. + uint32_t parallel_threads = 1; + + // When the compression options are set by the user, it will be set to "true". + // For bottommost_compression_opts, to enable it, user must set enabled=true. + // Otherwise, bottommost compression will use compression_opts as default + // compression options. + // + // For compression_opts, if compression_opts.enabled=false, it is still + // used as compression options for compression process. + bool enabled = false; + + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + uint64_t max_dict_buffer_bytes = 0; + + // Use zstd trainer to generate dictionaries. When this option is set to true, + // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes + // buffered data will be passed to zstd dictionary trainer to generate a + // dictionary of size max_dict_bytes. + // + // When this option is false, zstd's API ZDICT_finalizeDictionary() will be + // called to generate dictionaries. zstd_max_train_bytes of training sampled + // data will be passed to this API. Using this API should save CPU time on + // dictionary training, but the compression ratio may not be as good as using + // a dictionary trainer. + bool use_zstd_dict_trainer = true; + + // ===> END options that can be set by deprecated configuration syntax <=== + // ===> Use compression_opts={level=6;strategy=7;} form for below opts <=== + + // Essentially specifies a minimum acceptable compression ratio. A block is + // stored uncompressed if the compressed block does not achieve this ratio, + // because the downstream cost of decompression is not considered worth such + // a small savings (if any). + // However, the ratio is specified in a way that is efficient for checking. + // An integer from 1 to 1024 indicates the maximum allowable compressed bytes + // per 1KB of input, so the minimum acceptable ratio is 1024.0 / this value. + // For example, for a minimum ratio of 1.5:1, set to 683. See SetMinRatio(). + // Default: abandon use of compression for a specific block or entry if + // compressed by less than 12.5% (minimum ratio of 1.143:1). + int max_compressed_bytes_per_kb = 1024 * 7 / 8; + + // ZSTD only. + // Enable compression algorithm's checksum feature. + // (https://github.com/facebook/zstd/blob/d857369028d997c92ff1f1861a4d7f679a125464/lib/zstd.h#L428) + // Each compressed frame will have a 32-bit checksum attached. The checksum + // computed from the uncompressed data and can be verified during + // decompression. + bool checksum = false; + + // A convenience function for setting max_compressed_bytes_per_kb based on a + // minimum acceptable compression ratio (uncompressed size over compressed + // size). + void SetMinRatio(double min_ratio) { + max_compressed_bytes_per_kb = static_cast(1024.0 / min_ratio + 0.5); + } +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/concurrent_task_limiter.h b/include/rocksdb/concurrent_task_limiter.h index 9ad741f98d5..27ede4bc420 100644 --- a/include/rocksdb/concurrent_task_limiter.h +++ b/include/rocksdb/concurrent_task_limiter.h @@ -45,7 +45,7 @@ class ConcurrentTaskLimiter { // @param limit: max concurrent tasks. // limit = 0 means no new task allowed. // limit < 0 means no limitation. -extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, - int32_t limit); +ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0620172151e..1bc037e38a2 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -131,16 +131,31 @@ struct IngestExternalFileArg { IngestExternalFileOptions options; std::vector files_checksums; std::vector files_checksum_func_names; + // A hint as to the temperature for *reading* the files to be ingested. Temperature file_temperature = Temperature::kUnknown; }; struct GetMergeOperandsOptions { + using ContinueCallback = std::function; + // A limit on the number of merge operands returned by the GetMergeOperands() // API. In contrast with ReadOptions::merge_operator_max_count, this is a hard // limit: when it is exceeded, no merge operands will be returned and the // query will fail with an Incomplete status. See also the // DB::GetMergeOperands() API below. int expected_max_number_of_operands = 0; + + // `continue_cb` will be called after reading each merge operand, excluding + // any base value. Operands are read in order from newest to oldest. The + // operand value is provided as an argument. + // + // Returning false will end the lookup process at the merge operand on which + // `continue_cb` was just invoked. Returning true allows the lookup to + // continue. + // + // If it is nullptr, `GetMergeOperands()` will behave as if it always returned + // true (continue fetching merge operands until there are no more). + ContinueCallback continue_cb; }; // A collections of table properties objects, where @@ -549,53 +564,69 @@ class DB { // any, or an empty value otherwise. // // If timestamp is enabled and a non-null timestamp pointer is passed in, - // timestamp is returned. + // timestamp is returned. If the underlying DB implementation doesn't + // support returning timestamp and the timestamp argument is non-null, + // a Status::NotSupported() error will be returned. // // Returns OK on success. Returns NotFound and an empty value in "*value" if // there is no entry for "key". Returns some other non-OK status on error. + // NOTE: Pure virtual => was virtual before + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) = 0; + + // The timestamp of the key is returned if a non-null timestamp pointer is + // passed, and value is returned as a string + // NOTE: virtual final => disallow override (was previously allowed) virtual inline Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - std::string* value) { + std::string* value, std::string* timestamp) final { assert(value != nullptr); PinnableSlice pinnable_val(value); assert(!pinnable_val.IsPinned()); - auto s = Get(options, column_family, key, &pinnable_val); + auto s = Get(options, column_family, key, &pinnable_val, timestamp); if (s.ok() && pinnable_val.IsPinned()) { value->assign(pinnable_val.data(), pinnable_val.size()); } // else value is already assigned return s; } + + // No timestamp, and value is returned in a PinnableSlice + // NOTE: virtual final => disallow override (was previously allowed) virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) = 0; - virtual Status Get(const ReadOptions& options, const Slice& key, - std::string* value) { - return Get(options, DefaultColumnFamily(), key, value); + PinnableSlice* value) final { + return Get(options, column_family, key, value, nullptr); } - // Get() methods that return timestamp. Derived DB classes don't need to worry - // about this group of methods if they don't care about timestamp feature. + // No timestamp, and the value is returned as a string + // NOTE: virtual final => disallow override (was previously allowed) virtual inline Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, std::string* timestamp) { + std::string* value) final { assert(value != nullptr); PinnableSlice pinnable_val(value); assert(!pinnable_val.IsPinned()); - auto s = Get(options, column_family, key, &pinnable_val, timestamp); + auto s = Get(options, column_family, key, &pinnable_val); if (s.ok() && pinnable_val.IsPinned()) { value->assign(pinnable_val.data(), pinnable_val.size()); } // else value is already assigned return s; } - virtual Status Get(const ReadOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, PinnableSlice* /*value*/, - std::string* /*timestamp*/) { - return Status::NotSupported( - "Get() that returns timestamp is not implemented."); + + // Gets a key in the default column family, returns the value as a string, + // and no timestamp returned + // NOTE: virtual final => disallow override (was previously allowed) + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) final { + return Get(options, DefaultColumnFamily(), key, value); } + + // Gets a key in the default column family, returns the value as a string, + // and timestamp of the key is returned if timestamp parameter is non-null + // NOTE: virtual final => disallow override (was previously allowed) virtual Status Get(const ReadOptions& options, const Slice& key, - std::string* value, std::string* timestamp) { + std::string* value, std::string* timestamp) final { return Get(options, DefaultColumnFamily(), key, value, timestamp); } @@ -626,11 +657,12 @@ class DB { } // Populates the `merge_operands` array with all the merge operands in the DB - // for `key`. The `merge_operands` array will be populated in the order of - // insertion. The number of entries populated in `merge_operands` will be - // assigned to `*number_of_operands`. + // for `key`, or a customizable suffix of merge operands when + // `GetMergeOperandsOptions::continue_cb` is set. The `merge_operands` array + // will be populated in the order of insertion. The number of entries + // populated in `merge_operands` will be assigned to `*number_of_operands`. // - // If the number of merge operands in DB for `key` is greater than + // If the number of merge operands to return for `key` is greater than // `merge_operands_options.expected_max_number_of_operands`, // `merge_operands` is not populated and the return value is // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned @@ -650,9 +682,10 @@ class DB { int* number_of_operands) = 0; // Consistent Get of many keys across column families without the need - // for an explicit snapshot. NOTE: the implementation of this MultiGet API - // does not have the performance benefits of the void-returning MultiGet - // functions. + // for an explicit snapshot. The main difference between this set of + // MultiGet APis and the batched MultiGet APIs that follow are - + // 1. The APIs take std::vector instead of C style array pointers + // 2. Values are returned as std::string rather than PinnableSlice // // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and @@ -662,34 +695,67 @@ class DB { // // (*values) will always be resized to be the same size as (keys). // Similarly, the number of returned statuses will be the number of keys. + // If timestamps is non-null, the vector pointed to by it will be resized to + // number of keys and filled with timestamps of the keys on return. // Note: keys will not be "de-duplicated". Duplicate keys will return - // duplicate values in order. + // duplicate values in order, and may return different status values + // in case there are errors. + // NOTE: virtual final => disallow override (was previously allowed) + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_families, + const std::vector& keys, std::vector* values, + std::vector* timestamps) final { + size_t num_keys = keys.size(); + std::vector statuses(num_keys); + std::vector pin_values(num_keys); + + values->resize(num_keys); + if (timestamps) { + timestamps->resize(num_keys); + } + MultiGet(options, num_keys, + const_cast(column_families.data()), + keys.data(), pin_values.data(), + timestamps ? timestamps->data() : nullptr, statuses.data(), + /*sorted_input=*/false); + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + (*values)[i].assign(pin_values[i].data(), pin_values[i].size()); + } + } + return statuses; + } + + // No timestamps are returned + // NOTE: virtual final => disallow override (was previously allowed) virtual std::vector MultiGet( const ReadOptions& options, const std::vector& column_family, - const std::vector& keys, std::vector* values) = 0; + const std::vector& keys, std::vector* values) final { + values->resize(keys.size()); + return MultiGet(options, column_family, keys, values, nullptr); + } + + // MultiGet for default column family, no timestamps returned + // NOTE: virtual final => disallow override (was previously allowed) virtual std::vector MultiGet(const ReadOptions& options, const std::vector& keys, - std::vector* values) { + std::vector* values) final { + values->resize(keys.size()); return MultiGet( options, std::vector(keys.size(), DefaultColumnFamily()), keys, values); } + // MultiGet for default column family + // NOTE: virtual final => disallow override (was previously allowed) virtual std::vector MultiGet( - const ReadOptions& /*options*/, - const std::vector& /*column_family*/, - const std::vector& keys, std::vector* /*values*/, - std::vector* /*timestamps*/) { - return std::vector( - keys.size(), Status::NotSupported( - "MultiGet() returning timestamps not implemented.")); - } - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values, - std::vector* timestamps) { + const ReadOptions& options, const std::vector& keys, + std::vector* values, + std::vector* timestamps) final { + values->resize(keys.size()); return MultiGet( options, std::vector(keys.size(), DefaultColumnFamily()), @@ -704,123 +770,59 @@ class DB { // benefits. // Parameters - // options - ReadOptions - // column_family - ColumnFamilyHandle* that the keys belong to. All the keys - // passed to the API are restricted to a single column family // num_keys - Number of keys to lookup + // column_families - Pointer to C style array of ColumnFamilyHandle* that + // the keys belong to. // keys - Pointer to C style array of key Slices with num_keys elements // values - Pointer to C style array of PinnableSlices with num_keys elements + // timestamps - Pointer to C style array of std::string that, if non-null and + // timestamps are enabled, will be filled with timestamps of the + // keys on return. The array should be sized to num_keys entries + // by the caller. // statuses - Pointer to C style array of Status with num_keys elements // sorted_input - If true, it means the input keys are already sorted by key // order, so the MultiGet() API doesn't have to sort them // again. If false, the keys will be copied and sorted // internally by the API - the input array will not be // modified - virtual void MultiGet(const ReadOptions& options, - ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool /*sorted_input*/ = false) { - std::vector cf; - std::vector user_keys; - std::vector status; - std::vector vals; - for (size_t i = 0; i < num_keys; ++i) { - cf.emplace_back(column_family); - user_keys.emplace_back(keys[i]); - } - status = MultiGet(options, cf, user_keys, &vals); - std::copy(status.begin(), status.end(), statuses); - for (auto& value : vals) { - values->PinSelf(value); - values++; - } - } + // NOTE: Pure virtual => was virtual (optional). If the concrete + // implementation + // doesn't support returning timestamps, and the timestamps paramater is + // non-null, it should return Status::NotSupported() for all the keys. + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input = false) = 0; + // MultiGet for single column family + // NOTE: virtual final => disallow override (was previously allowed) virtual void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, std::string* timestamps, - Status* statuses, const bool /*sorted_input*/ = false) { - std::vector cf; - std::vector user_keys; - std::vector status; - std::vector vals; - std::vector tss; + Status* statuses, + const bool sorted_input = false) final; - for (size_t i = 0; i < num_keys; ++i) { - cf.emplace_back(column_family); - user_keys.emplace_back(keys[i]); - } - status = MultiGet(options, cf, user_keys, &vals, &tss); - std::copy(status.begin(), status.end(), statuses); - std::copy(tss.begin(), tss.end(), timestamps); - for (auto& value : vals) { - values->PinSelf(value); - values++; - } + // MultiGet for single column family, no timestamps returned + // NOTE: virtual final => disallow override (was previously allowed) + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) final { + MultiGet(options, column_family, num_keys, keys, values, nullptr, statuses, + sorted_input); } - // Overloaded MultiGet API that improves performance by batching operations - // in the read path for greater efficiency. Currently, only the block based - // table format with full filters are supported. Other table formats such - // as plain table, block based table with block based filters and - // partitioned indexes will still work, but will not get any performance - // benefits. - // Parameters - - // options - ReadOptions - // column_family - ColumnFamilyHandle* that the keys belong to. All the keys - // passed to the API are restricted to a single column family - // num_keys - Number of keys to lookup - // keys - Pointer to C style array of key Slices with num_keys elements - // values - Pointer to C style array of PinnableSlices with num_keys elements - // statuses - Pointer to C style array of Status with num_keys elements - // sorted_input - If true, it means the input keys are already sorted by key - // order, so the MultiGet() API doesn't have to sort them - // again. If false, the keys will be copied and sorted - // internally by the API - the input array will not be - // modified + // Multiple column families, no timestamps returned + // NOTE: virtual final => disallow override (was previously allowed) virtual void MultiGet(const ReadOptions& options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, Status* statuses, - const bool /*sorted_input*/ = false) { - std::vector cf; - std::vector user_keys; - std::vector status; - std::vector vals; - - for (size_t i = 0; i < num_keys; ++i) { - cf.emplace_back(column_families[i]); - user_keys.emplace_back(keys[i]); - } - status = MultiGet(options, cf, user_keys, &vals); - std::copy(status.begin(), status.end(), statuses); - for (auto& value : vals) { - values->PinSelf(value); - values++; - } - } - virtual void MultiGet(const ReadOptions& options, const size_t num_keys, - ColumnFamilyHandle** column_families, const Slice* keys, - PinnableSlice* values, std::string* timestamps, - Status* statuses, const bool /*sorted_input*/ = false) { - std::vector cf; - std::vector user_keys; - std::vector status; - std::vector vals; - std::vector tss; - - for (size_t i = 0; i < num_keys; ++i) { - cf.emplace_back(column_families[i]); - user_keys.emplace_back(keys[i]); - } - status = MultiGet(options, cf, user_keys, &vals, &tss); - std::copy(status.begin(), status.end(), statuses); - std::copy(tss.begin(), tss.end(), timestamps); - for (auto& value : vals) { - values->PinSelf(value); - values++; - } + const bool sorted_input = false) final { + MultiGet(options, num_keys, column_families, keys, values, nullptr, + statuses, sorted_input); } // Batched MultiGet-like API that returns wide-column entities from a single @@ -969,9 +971,19 @@ class DB { const std::vector& column_families, std::vector* iterators) = 0; + // Returns the iterator sequence number virtual SequenceNumber GetIteratorSequenceNumber(Iterator* it) = 0; + // UNDER CONSTRUCTION - DO NOT USE + // Return a cross-column-family iterator from a consistent database state. + // When the same key is present in multiple column families, the iterator + // selects the value or columns from the first column family containing the + // key, in the order specified by the `column_families` parameter. + virtual std::unique_ptr NewMultiCfIterator( + const ReadOptions& options, + const std::vector& column_families) = 0; + // Return a handle to the current DB state. Iterators created with // this handle will all observe a stable snapshot of the current DB // state. The caller must call ReleaseSnapshot(result) when the @@ -1740,11 +1752,6 @@ class DB { // The sequence number of the most recent transaction. virtual SequenceNumber GetLatestSequenceNumber() const = 0; - // Prevent file deletions. Compactions will continue to occur, - // but no obsolete files will be deleted. Calling this multiple - // times have the same effect as calling it once. - virtual Status DisableFileDeletions() = 0; - // Increase the full_history_ts of column family. The new ts_low value should // be newer than current full_history_ts value. // If another thread updates full_history_ts_low concurrently to a higher @@ -1756,26 +1763,24 @@ class DB { virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, std::string* ts_low) = 0; - // Enable deleting obsolete files. - // Usually users should only need to call this if they have previously called - // `DisableFileDeletions`. + // Suspend deleting obsolete files. Compactions will continue to occur, + // but no obsolete files will be deleted. To resume file deletions, each + // call to DisableFileDeletions() must be matched by a subsequent call to + // EnableFileDeletions(). For more details, see EnableFileDeletions(). + virtual Status DisableFileDeletions() = 0; + + // Resume deleting obsolete files, following up on `DisableFileDeletions()`. + // // File deletions disabling and enabling is not controlled by a binary flag, // instead it's represented as a counter to allow different callers to // independently disable file deletion. Disabling file deletion can be // critical for operations like making a backup. So the counter implementation // makes the file deletion disabled as long as there is one caller requesting // so, and only when every caller agrees to re-enable file deletion, it will - // be enabled. So be careful when calling this function with force = true as - // explained below. - // If force == true, the call to EnableFileDeletions() will guarantee that - // file deletions are enabled after the call, even if DisableFileDeletions() - // was called multiple times before. - // If force == false, EnableFileDeletions will only enable file deletion - // after it's been called at least as many times as DisableFileDeletions(), - // enabling the two methods to be called by two threads concurrently without + // be enabled. Two threads can call this method concurrently without // synchronization -- i.e., file deletions will be enabled only after both // threads call EnableFileDeletions() - virtual Status EnableFileDeletions(bool force) = 0; + virtual Status EnableFileDeletions() = 0; // Retrieves the creation time of the oldest file in the DB. // This API only works if max_open_files = -1, if it is not then @@ -1901,6 +1906,14 @@ class DB { // to Flush the memtable first before ingesting the file. // In the second mode we will always ingest in the bottom most level (see // docs to IngestExternalFileOptions::ingest_behind). + // For a column family that enables user-defined timestamps, ingesting + // external SST files are supported with these limitations: 1) Ingested file's + // user key (without timestamp) range should not overlap with the db's key + // range. 2) When ingesting multiple external SST files, their key ranges + // should not overlap with each other either. 3) Ingestion behind mode is not + // supported. 4) When an ingested file contains point data and range deletion + // for the same key, the point data currently overrides the range deletion + // regardless which one has the higher user-defined timestamps. // // (1) External SST files can be created using SstFileWriter // (2) We will try to ingest the files to the lowest possible level @@ -1913,6 +1926,16 @@ class DB { // the files cannot be ingested to the bottommost level, and it is the // user's responsibility to clear the bottommost level in the overlapping // range before re-attempting the ingestion. + // + // EXPERIMENTAL: the temperatures of the files after ingestion are currently + // determined like this: + // - If the ingested file is moved rather than copied, its temperature is + // inherited from the input file. + // - If either ingest_behind or fail_if_not_bottommost_level is set to true, + // then the temperature is set to the CF's last_level_temperature. + // - Otherwise, the temperature is set to the CF's default_write_temperature. + // (Landing in the last level does not currently guarantee using + // last_level_temperature - TODO) virtual Status IngestExternalFile( ColumnFamilyHandle* column_family, const std::vector& external_files, diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 7b0220635ec..d81960c437f 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -67,6 +67,7 @@ struct ThreadStatus; class FileSystem; class SystemClock; struct ConfigOptions; +struct IOOptions; const size_t kDefaultPageSize = 4 * 1024; @@ -1001,7 +1002,7 @@ class WritableFile { /* * Get the size of valid data in the file. */ - virtual uint64_t GetFileSize() { return 0; } + virtual uint64_t GetFileSize() = 0; /* * Get and set the default pre-allocation block size for writes to @@ -1206,7 +1207,7 @@ enum InfoLogLevel : unsigned char { // including data loss, unreported corruption, deadlocks, and more. class Logger { public: - size_t kDoNotSupportGetLogFileSize = (std::numeric_limits::max)(); + static constexpr size_t kDoNotSupportGetLogFileSize = SIZE_MAX; explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) : closed_(false), log_level_(log_level) {} @@ -1301,62 +1302,60 @@ class DynamicLibrary { virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0; }; -extern void LogFlush(const std::shared_ptr& info_log); +void LogFlush(const std::shared_ptr& info_log); -extern void Log(const InfoLogLevel log_level, - const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); +void Log(const InfoLogLevel log_level, const std::shared_ptr& info_log, + const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); // a set of log functions with different log levels. -extern void Header(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Debug(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Info(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Warn(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Error(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Fatal(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Header(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Debug(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Info(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Warn(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Error(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Fatal(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); // Log the specified data to *info_log if info_log is non-nullptr. // The default info log level is InfoLogLevel::INFO_LEVEL. -extern void Log(const std::shared_ptr& info_log, const char* format, - ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +void Log(const std::shared_ptr& info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void LogFlush(Logger* info_log); +void LogFlush(Logger* info_log); -extern void Log(const InfoLogLevel log_level, Logger* info_log, - const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); +void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); // The default info log level is InfoLogLevel::INFO_LEVEL. -extern void Log(Logger* info_log, const char* format, ...) +void Log(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); // a set of log functions with different log levels. -extern void Header(Logger* info_log, const char* format, ...) +void Header(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Debug(Logger* info_log, const char* format, ...) +void Debug(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Info(Logger* info_log, const char* format, ...) +void Info(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Warn(Logger* info_log, const char* format, ...) +void Warn(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Error(Logger* info_log, const char* format, ...) +void Error(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void Fatal(Logger* info_log, const char* format, ...) +void Fatal(Logger* info_log, const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); // A utility routine: write "data" to the named file. -extern Status WriteStringToFile(Env* env, const Slice& data, - const std::string& fname, - bool should_sync = false); +Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, + bool should_sync = false, + const IOOptions* io_options = nullptr); // A utility routine: read contents of named file into *data -extern Status ReadFileToString(Env* env, const std::string& fname, - std::string* data); +Status ReadFileToString(Env* env, const std::string& fname, std::string* data); // Below are helpers for wrapping most of the classes in this file. // They forward all calls to another instance of the class. diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 6feae06811b..a782a1a6e79 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -32,7 +32,7 @@ std::shared_ptr NewEncryptedFS( // blocks). E.g. CTR (Counter operation mode) supports this requirement. class BlockAccessCipherStream { public: - virtual ~BlockAccessCipherStream(){}; + virtual ~BlockAccessCipherStream(){} // BlockSize returns the size of each block supported by this cipher stream. virtual size_t BlockSize() = 0; diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h index b59395255c1..79b05c016e5 100644 --- a/include/rocksdb/experimental.h +++ b/include/rocksdb/experimental.h @@ -5,6 +5,11 @@ #pragma once +#include +#include +#include + +#include "rocksdb/data_structure.h" #include "rocksdb/db.h" #include "rocksdb/status.h" @@ -52,5 +57,431 @@ Status UpdateManifestForFilesState( const std::vector& column_families, const UpdateManifestForFilesStateOptions& opts = {}); +// **************************************************************************** +// EXPERIMENTAL new filtering features +// **************************************************************************** + +// A class for splitting a key into meaningful pieces, or "segments" for +// filtering purposes. Keys can also be put in "categories" to simplify +// some configuration and handling. To simplify satisfying some filtering +// requirements, the segments must encompass a complete key prefix (or the whole +// key) and segments cannot overlap. +// +// Once in production, the behavior associated with a particular Name() +// cannot change. Introduce a new Name() when introducing new behaviors. +// See also SstQueryFilterConfigsManager below. +// +// OTHER CURRENT LIMITATIONS (maybe relaxed in the future for segments only +// needing point query or WHERE filtering): +// * Assumes the (default) byte-wise comparator is used. +// * Assumes the category contiguousness property: that each category is +// contiguous in comparator order. In other words, any key between two keys of +// category c must also be in category c. +// * Assumes the (weak) segment ordering property (described below) always +// holds. (For byte-wise comparator, this is implied by the segment prefix +// property, also described below.) +// * Not yet compatible with user timestamp feature +// +// SEGMENT ORDERING PROPERTY: For maximum use in filters, especially for +// filtering key range queries, we must have a correspondence between +// the lexicographic ordering of key segments and the ordering of keys +// they are extracted from. In other words, if we took the segmented keys +// and ordered them primarily by (byte-wise) order on segment 0, then +// on segment 1, etc., then key order of the original keys would not be +// violated. This is the WEAK form of the property, where multiple keys +// might generate the same segments, but such keys must be contiguous in +// key order. (The STRONG form of the property is potentially more useful, +// but for bytewise comparator, it can be inferred from segments satisfying +// the weak property by assuming another segment that extends to the end of +// the key, which would be empty if the segments already extend to the end +// of the key.) +// +// The segment ordering property is hard to think about directly, but for +// bytewise comparator, it is implied by a simpler property to reason about: +// the segment prefix property (see below). (NOTE: an example way to satisfy +// the segment ordering property while breaking the segment prefix property +// is to have a segment delimited by any byte smaller than a certain value, +// and not include the delimiter with the segment leading up to the delimiter. +// For example, the space character is ordered before other printable +// characters, so breaking "foo bar" into "foo", " ", and "bar" would be +// legal, but not recommended.) +// +// SEGMENT PREFIX PROPERTY: If a key generates segments s0, ..., sn (possibly +// more beyond sn) and sn does not extend to the end of the key, then all keys +// starting with bytes s0+...+sn (concatenated) also generate the same segments +// (possibly more). For example, if a key has segment s0 which is less than the +// whole key and another key starts with the bytes of s0--or only has the bytes +// of s0--then the other key must have the same segment s0. In other words, any +// prefix of segments that might not extend to the end of the key must form an +// unambiguous prefix code. See +// https://en.wikipedia.org/wiki/Prefix_code In other other words, parsing +// a key into segments cannot use even a single byte of look-ahead. Upon +// processing each byte, the extractor decides whether to cut a segment that +// ends with that byte, but not one that ends before that byte. The only +// exception is that upon reaching the end of the key, the extractor can choose +// whether to make a segment that ends at the end of the key. +// +// Example types of key segments that can be freely mixed in any order: +// * Some fixed number of bytes or codewords. +// * Ends in a delimiter byte or codeword. (Not including the delimiter as +// part of the segment leading up to it would very likely violate the segment +// prefix property.) +// * Length-encoded sequence of bytes or codewords. The length could even +// come from a preceding segment. +// * Any/all remaining bytes to the end of the key, though this implies all +// subsequent segments will be empty. +// For each kind of segment, it should be determined before parsing the segment +// whether an incomplete/short parse will be treated as a segment extending to +// the end of the key or as an empty segment. +// +// For example, keys might consist of +// * Segment 0: Any sequence of bytes up to and including the first ':' +// character, or the whole key if no ':' is present. +// * Segment 1: The next four bytes, all or nothing (in case of short key). +// * Segment 2: An unsigned byte indicating the number of additional bytes in +// the segment, and then that many bytes (or less up to the end of the key). +// * Segment 3: Any/all remaining bytes in the key +// +// For an example of what can go wrong, consider using '4' as a delimiter +// but not including it with the segment leading up to it. Suppose we have +// these keys and corresponding first segments: +// "123456" -> "123" +// "124536" -> "12" +// "125436" -> "125" +// Notice how byte-wise comparator ordering of the segments does not follow +// the ordering of the keys. This means we cannot safely use a filter with +// a range of segment values for filtering key range queries. +// +// Also note that it is legal for all keys in a category (or many categories) +// to return an empty sequence of segments. +// +// To eliminate a confusing distinction between a segment that is empty vs. +// "not present" for a particular key, each key is logically assiciated with +// an infinite sequence of segments, including some infinite tail of 0-length +// segments. In practice, we only represent a finite sequence that (at least) +// covers the non-trivial segments. +// +class KeySegmentsExtractor { + public: + // The extractor assigns keys to categories so that it is easier to + // combine distinct (though disjoint) key representations within a single + // column family while applying different or overlapping filtering + // configurations to the categories. + // To enable fast set representation, the user is allowed up to 64 + // categories for assigning to keys with the extractor. The user will + // likely cast to their own enum type or scalars. + enum KeyCategory : uint_fast8_t { + kDefaultCategory = 0, + kMinCategory = kDefaultCategory, + // ... (user categories) + // Can be used for keys ordered before typical keys. Not necessarily an + // error. + kReservedLowCategory = 62, + // Can be used for keys ordered after typical keys. Not necessarily an + // error. + kReservedHighCategory = 63, + kMaxUsableCategory = kReservedHighCategory, + + // Signals to the caller that an unexpected input or condition has + // been reached and filter construction should be aborted. + kErrorCategoryFilterScope = UINT8_MAX - 2, + kMinErrorCategory = kErrorCategoryFilterScope, + // Signals to the caller that an unexpected input or condition has + // been reached and SST construction (and compaction or flush) + // should be aborted. + kErrorCategoryFileScope = UINT8_MAX - 1, + // Signals to the caller that an unexpected input or condition has + // been reached and the DB should be considered to have reached an + // invalid state, at least in memory. + kErrorCategoryDbScope = UINT8_MAX, + }; + using KeyCategorySet = SmallEnumSet; + + // The extractor can process three kinds of key-like inputs + enum KeyKind { + // User key, not including user timestamp + kFullUserKey, + // An iterator lower bound (inclusive). This should generally be handled + // the same as a full user key but the distinction might be useful for + // diagnostics or assertions. + kInclusiveLowerBound, + // An iterator upper bound (exclusive). Upper bounds are frequently + // constructed by incrementing the last byte of a key prefix, and this can + // affect what should be considered as a segment delimiter. + kExclusiveUpperBound, + }; + + // The extractor result + struct Result { + // Positions in the key (or bound) that represent boundaries + // between segments, or the exclusive end of each segment. For example, if + // the key is "abc|123|xyz" then following the guidance of including + // delimiters with the preceding segment, segment_ends would be {4, 8, 11}, + // representing segments "abc|" "123|" and "xyz". Empty segments are + // naturally represented with repeated values, as in {4, 8, 8} for + // "abc|123|", though {4, 8} would be logically equivalent because an + // infinite sequence of 0-length segments is assumed after what is + // explicitly represented here. However, segments might not reach the end + // the key (no automatic last segment to the end of the key) and that is + // OK for the WEAK ordering property. + // + // The first segment automatically starts at key position 0. The only way + // to put gaps between segments of interest is to assign those gaps to + // numbered segments, which can be left unused. + std::vector segment_ends; + + // A category to assign to the key or bound. This default may be kept, + // such as to put all keys into a single category. + // IMPORTANT CURRENT LIMITATION from above: each category must be + // contiguous in key comparator order, so any key between two keys in + // category c must also be in category c. (Typically the category will be + // determined by segment 0 in some way, often the first byte.) The enum + // scalar values do not need to be related to key order. + KeyCategory category = kDefaultCategory; + }; + + virtual ~KeySegmentsExtractor() {} + + // A class name for this extractor. See also expectations in GetId(). + virtual const char* Name() const = 0; + + // An identifying string that is permanently associated with the behavior + // of this extractor. If a behavior change is made or set in the constructor, + // the id must change to avoid incorrect filtering behavior on DBs using a + // previous version of the extractor. + virtual std::string GetId() const { + // The default implementation assumes no configuration variance in the + // constructor and just returns the class name. + return Name(); + } + + // Populates the extraction result and returns OK. Error can be signaled + // with `kError` pseudo-categories. This function is expected to generate + // non-error results (though possibly empty) on all keys or bounds expected + // to be encountered by the DB. RocksDB will always call the function with + // a (pointer to a) default-initialized result object. + virtual void Extract(const Slice& key_or_bound, KeyKind kind, + Result* result) const = 0; +}; + +// Alternatives for filtering inputs + +// An individual key segment. +struct SelectKeySegment { + // Segments are numbered starting from 0. + explicit SelectKeySegment(uint16_t _segment_index) + : segment_index(_segment_index) {} + uint16_t segment_index; +}; + +// A range of key segments concatenated together. No concatenation operations +// are needed, however, because with no gaps between segments, a range of +// segments is a simple substring of the key. +struct SelectKeySegmentRange { + // Segments are numbered starting from 0. Range is inclusive. + explicit SelectKeySegmentRange(uint8_t _from_segment_index, + uint8_t _to_segment_index) + : from_segment_index(_from_segment_index), + to_segment_index(_to_segment_index) {} + // Inclusive + uint8_t from_segment_index; + // Inclusive + uint8_t to_segment_index; +}; + +// User key without timestamp +struct SelectWholeKey {}; + +// TODO: The remaining Select* are not yet supported +// As generated by prefix_extractor +struct SelectLegacyKeyPrefix {}; + +struct SelectUserTimestamp {}; + +struct SelectColumnName {}; + +struct SelectValue {}; + +// Note: more variants might be added in the future. +using FilterInput = + std::variant; + +// Base class for individual filtering schemes in terms of chosen +// FilterInputs, but not tied to a particular KeySegmentsExtractor. +// +// Not user extensible, name sometimes shortened to SQFC +class SstQueryFilterConfig { + public: + virtual ~SstQueryFilterConfig() {} +}; + +// A filtering scheme that stores minimum and maximum values (according +// to bytewise ordering) of the specified filter input. Because the +// empty string is often a special case, the filter excludes that from the +// min/max computation and keeps a separate boolean for whether empty is +// present. +// +// The filter is also limited to the specified categories, ignoring entries +// outside the given set of categories. If not All, ranges can only be +// filtered if upper and lower bounds are in the same category (and that +// category is in the set relevant to the filter). +std::shared_ptr MakeSharedBytewiseMinMaxSQFC( + FilterInput select, KeySegmentsExtractor::KeyCategorySet categories = + KeySegmentsExtractor::KeyCategorySet::All()); + +// TODO: more kinds of filters, eventually including Bloom/ribbon filters +// and replacing the old filter configuration APIs + +// Represents a complete strategy for representing filters in SST files +// and applying them to optimize range and point queries by excluding +// irrelevant SST files (as best we can). This is a set of filtering +// schemes and a KeySegmentsExtractor. For performance, a single extractor +// should be implemented to meet all the filtering needs of any given +// column family. KeySegmentsExtractor and FilterInput should be flexible +// enough that there is no loss of generality, e.g. with leaving segments +// blank and using segment ranges. +struct SstQueryFilterConfigs { + std::vector> filters; + std::shared_ptr extractor; + + // Whether this object represent an empty set of configs because no + // applicable configurations were found. (This case is represented by + // an internal singleton instance.) + bool IsEmptyNotFound() const; +}; + +// SstQueryFilterConfigsManager provides facilities for safe and effective +// filtering version management, with simple dynamic upgrade/downgrade +// support. It is designed to encourage a development pattern that +// minimizes the risk of filter and extractor versioning bugs. +// +// SstQueryFilterConfigsManager is essentially an immutable mapping +// from {config_name_string, version_number} -> SstQueryFilterConfigs +// for some contiguous range of version numbers. It is also a starting +// point for specifying which configuration should be used, with awareness +// of other configurations that might already be persisted in SST files +// or switched to dynamically. +// +// Background: a single codebase might have many use cases and for +// each use case, a sequence of past, current, and future filtering +// configurations. It is common for future configurations to be +// implemented before automatically deployed in order to ensure that +// a DB can be effectively opened and operated on by a recent older code +// version. And it is common to maintain a reasonable history of past +// configurations to ensure smooth upgrade path and proper handling of +// older SST files that haven't been compacted recently. +// +// It would be possible to make SstQueryFilterConfigs dynamically +// configurable through strings, but that would encourage deployment +// of ad-hoc, under-tested configurations. +// +// Solution: the {config_name_string, version_number} -> SstQueryFilterConfigs +// mapping in SstQueryFilterConfigsManager formalizes the history (past and +// future) of approved/tested configurations for a given use case. Filter +// configurations are kept paired with extractors that they make sense with. +// +// The version numbers are "global" to the SstQueryFilterConfigsManager so +// that it is simple to determine whether a particular code version supports +// a particular filtering version, regardless of which use case. Numbering +// always starts with 1, as 0 is reserved for selecting a "no filters" +// configuration +// +// Consider an example initialized with this Data: +// +// SstQueryFilterConfigsManager::Data data = { +// {1, {{"foo", foo_configs}}}, +// {2, {{"bar", bar_configs}, +// {"baz", baz_configs}}}, +// {3, {{"bar", bar_configs_v2}}}, +// }; +// +// For example, MakeSharedFactory(..., "baz", 1) will use a default empty +// config, while both MakeSharedFactory(..., "baz", 2) and +// MakeSharedFactory(..., "baz", 3) select `baz_configs`. Selecting version +// >= 4 is rejected because those configurations are not known to this +// version of the code. +// +// For correct operation, existing versions should be treated as immutable +// (once committed to where they could enter production). For example, an +// update to "baz" should be done in a new version (4), not by amending to +// version 3. Note also (from before) that the behavior of named extractors +// must not change, so changes to key segment extraction should introduce +// new named extractors while keeping the old in the older configs. +// +// It is possible to eventually remove the oldest versions, as long as +// * You won't be rolling back to that version. +// * You don't have any SST files using an extractor that is only available +// in that version (and prior). +// * For each use case and version you might roll back to, you aren't removing +// the configuration in effect for that version. In our example above, we +// cannot simply remove version 1 because that would change the configuration +// of "foo" at version 2. Moving {"foo", foo_configs} to version 2 would be +// an acceptable work-around for retiring version 1. +// * There are no gaps in the version numbers specified. Even if you completely +// retire a use case and want to remove relevant code, you still need to keep +// an explicit mapping, even if it's empty, as in `{3, {}}` if retiring "bar". +// +// Internally, the SstQueryFilterConfigsManager greatly simplifies lifetime +// management for relevant objects such as KeySegmentExtractors, and provides +// a lighter weight (and less troublesome) mechanism for relevant named object +// look-up vs. ObjectRegistry. If following the guidelines above, any extractor +// referenced in a read SST file should also be referenced by the +// SstQueryFilterConfigsManager. +// +// Not user extensible +class SstQueryFilterConfigsManager + : public std::enable_shared_from_this { + public: + using FilteringVersion = uint32_t; + using NamedConfigs = std::pair; + using Data = + std::vector>>; + + static Status MakeShared(const Data& data, + std::shared_ptr* out); + + virtual ~SstQueryFilterConfigsManager() {} + + // EXPERIMENTAL/TEMPORARY: hook into table properties for persisting + // filters and table_filter for applying to range queries. + + class Factory : public TablePropertiesCollectorFactory { + public: + // Modify the target filtering version for new filters. Returns + // non-OK if the version is not supported. Thread-safe. + virtual Status SetFilteringVersion(FilteringVersion ver) = 0; + virtual FilteringVersion GetFilteringVersion() const = 0; + + // The configs_name used to create this Factory. Immutable. + virtual const std::string& GetConfigsName() const = 0; + + // The relevant configs from the SstQueryFilterConfigsManager for + // the ConfigsName and FilteringVersion. + virtual const SstQueryFilterConfigs& GetConfigs() const = 0; + + // The buffers pointed to by the Slices must live as long as any read + // operations using this table filter function. + // Can read and process any filters created under this + // SstQueryFilterConfigsManager but is most efficient when using the + // same KeySegmentExtractor as this Factory's configs. + // (That performance optimization is the only reason this function is here + // rather than in SstQueryFilterConfigsManager.) + virtual std::function + GetTableFilterForRangeQuery(Slice lower_bound_incl, + Slice upper_bound_excl) const = 0; + }; + + // Returns OK and creates a Factory as long as `ver` is in the + // supported range or 0 (always empty/not found). If the particular + // config_name is not found under that version, then + // factory->GetConfigs().IsEmptyNotFound() will be true. Such a factory can + // read filters but will not write any filters. + virtual Status MakeSharedFactory(const std::string& configs_name, + FilteringVersion ver, + std::shared_ptr* out) const = 0; +}; + } // namespace experimental } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h index 758bae4acf2..66024d0a1b4 100644 --- a/include/rocksdb/file_checksum.h +++ b/include/rocksdb/file_checksum.h @@ -131,7 +131,7 @@ class FileChecksumList { }; // Create a new file checksum list. -extern FileChecksumList* NewFileChecksumList(); +FileChecksumList* NewFileChecksumList(); // Return a shared_ptr of the builtin Crc32c based file checksum generator // factory object, which can be shared to create the Crc32c based checksum diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 578b83de12c..1748b7c6393 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -82,7 +82,14 @@ enum class IOType : uint8_t { // enum representing various operations supported by underlying FileSystem. // These need to be set in SupportedOps API for RocksDB to use them. -enum FSSupportedOps { kAsyncIO, kFSBuffer }; +enum FSSupportedOps { + kAsyncIO, // Supports async reads + kFSBuffer, // Supports handing off the file system allocated read buffer + // to the caller of Read/MultiRead + kVerifyAndReconstructRead, // Supports a higher level of data integrity. See + // the verify_and_reconstruct_read flag in + // IOOptions. +}; // Per-request options that can be passed down to the FileSystem // implementation. These are hints and are not necessarily guaranteed to be @@ -121,6 +128,18 @@ struct IOOptions { // directories and list only files in GetChildren API. bool do_not_recurse; + // Setting this flag indicates a corruption was detected by a previous read, + // so the caller wants to re-read the data with much stronger data integrity + // checking and correction, i.e requests the file system to reconstruct the + // data from redundant copies and verify checksums, if available, in order + // to have a better chance of success. It is expected that this will have a + // much higher overhead than a normal read. + // This is a hint. At a minimum, the file system should implement this flag in + // FSRandomAccessFile::Read and FSSequentialFile::Read + // NOTE: The file system must support kVerifyAndReconstructRead in + // FSSupportedOps, otherwise this feature will not be used. + bool verify_and_reconstruct_read; + // EXPERIMENTAL Env::IOActivity io_activity = Env::IOActivity::kUnknown; @@ -132,7 +151,8 @@ struct IOOptions { rate_limiter_priority(Env::IO_TOTAL), type(IOType::kUnknown), force_dir_fsync(force_dir_fsync_), - do_not_recurse(false) {} + do_not_recurse(false), + verify_and_reconstruct_read(false) {} }; struct DirFsyncOptions { @@ -770,6 +790,8 @@ class FSSequentialFile { // SequentialFileWrapper too. }; +using FSAllocationPtr = std::unique_ptr>; + // A read IO request structure for use in MultiRead and asynchronous Read APIs. struct FSReadRequest { // Input parameter that represents the file offset in bytes. @@ -836,7 +858,7 @@ struct FSReadRequest { // - FSReadRequest::result should point to fs_scratch. // - This is needed only if FSSupportedOps::kFSBuffer support is provided by // underlying FS. - std::unique_ptr> fs_scratch; + FSAllocationPtr fs_scratch; }; // A file abstraction for randomly reading the contents of a file. @@ -910,7 +932,7 @@ class FSRandomAccessFile { virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { return 0; // Default implementation to prevent issues with backwards // compatibility. - }; + } enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed }; @@ -963,10 +985,10 @@ class FSRandomAccessFile { // AbortIO API. // // Default implementation is to read the data synchronously. - virtual IOStatus ReadAsync( - FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, - void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) { + virtual IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function cb, + void* cb_arg, void** /*io_handle*/, + IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) { req.status = Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg); cb(req, cb_arg); @@ -1197,9 +1219,7 @@ class FSWritableFile { * Get the size of valid data in the file. */ virtual uint64_t GetFileSize(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) { - return 0; - } + IODebugContext* /*dbg*/) = 0; /* * Get and set the default pre-allocation block size for writes to @@ -1648,16 +1668,16 @@ class FileSystemWrapper : public FileSystem { std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const override; - virtual IOStatus Poll(std::vector& io_handles, - size_t min_completions) override { + IOStatus Poll(std::vector& io_handles, + size_t min_completions) override { return target_->Poll(io_handles, min_completions); } - virtual IOStatus AbortIO(std::vector& io_handles) override { + IOStatus AbortIO(std::vector& io_handles) override { return target_->AbortIO(io_handles); } - virtual void SupportedOps(int64_t& supported_ops) override { + void SupportedOps(int64_t& supported_ops) override { return target_->SupportedOps(supported_ops); } @@ -1732,7 +1752,7 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile { } size_t GetUniqueId(char* id, size_t max_size) const override { return target_->GetUniqueId(id, max_size); - }; + } void Hint(AccessPattern pattern) override { target_->Hint(pattern); } bool use_direct_io() const override { return target_->use_direct_io(); } size_t GetRequiredBufferAlignment() const override { @@ -1742,7 +1762,7 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile { return target_->InvalidateCache(offset, length); } IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override { return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg); @@ -1979,12 +1999,12 @@ class FSDirectoryWrapper : public FSDirectory { }; // A utility routine: write "data" to the named file. -extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, - const std::string& fname, - bool should_sync = false); +IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, bool should_sync = false, + const IOOptions& io_options = IOOptions()); // A utility routine: read contents of named file into *data -extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, - std::string* data); +IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 8568dd2588c..0cddf4a3346 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -112,6 +112,9 @@ class Iterator : public Cleanable { // Regardless of whether the iterator was created/refreshed previously // with or without a snapshot, the iterator will be reading the // latest DB state after this call. + // Note that you will need to call a Seek*() function to get the iterator + // back into a valid state before calling a function that assumes the + // state is already valid, like Next(). virtual Status Refresh() { return Refresh(nullptr); } // Similar to Refresh() but the iterator will be reading the latest DB state @@ -134,15 +137,18 @@ class Iterator : public Cleanable { // Get the user-key portion of the internal key at which the iteration // stopped. // Property "rocksdb.iterator.write-time": - // DO NOT USE, UNDER CONSTRUCTION // Get the unix time of the best estimate of the write time of the entry. // Returned as 64-bit raw value (8 bytes). It can be converted to uint64_t // with util method `DecodeU64Ts`. The accuracy of the write time depends on - // settings like preserve_internal_time_seconds. If this feature is - // disabled, this property will always be empty. The actual write time of + // settings like preserve_internal_time_seconds. The actual write time of // the entry should be the same or newer than the returned write time. So // this property can be interpreted as the possible oldest write time for // the entry. + // If the seqno to time mapping recording is not enabled, + // std::numeric_limits::max() will be returned to indicate the + // write time is unknown. For data entry whose sequence number has + // been zeroed out (possible when they reach the last level), 0 is returned + // no matter whether the seqno to time recording feature is enabled or not. virtual Status GetProperty(std::string prop_name, std::string* prop); virtual Slice timestamp() const { @@ -152,9 +158,9 @@ class Iterator : public Cleanable { }; // Return an empty iterator (yields nothing). -extern Iterator* NewEmptyIterator(); +Iterator* NewEmptyIterator(); // Return an empty iterator with the specified status. -extern Iterator* NewErrorIterator(const Status& status); +Iterator* NewErrorIterator(const Status& status); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 2cc30d871a4..452ae54cdfb 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -251,7 +251,8 @@ enum class FileOperationType { kRangeSync, kAppend, kPositionedAppend, - kOpen + kOpen, + kVerify }; struct FileOperationInfo { diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index dc744d7d1a3..d3dfdfdcd50 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -80,7 +80,7 @@ struct JemallocAllocatorOptions { // (thread-local cache) is enabled to cache unused allocations for future use. // The tcache normally incurs 0.5M extra memory usage per-thread. The usage // can be reduced by limiting allocation sizes to cache. -extern Status NewJemallocNodumpAllocator( +Status NewJemallocNodumpAllocator( const JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index be0f6cd1f18..d109a542fe3 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -56,7 +56,7 @@ struct DBOptions; using KeyHandle = void*; -extern Slice GetLengthPrefixedSlice(const char* data); +Slice GetLengthPrefixedSlice(const char* data); class MemTableRep { public: @@ -341,15 +341,15 @@ class SkipListFactory : public MemTableRepFactory { // Methods for Configurable/Customizable class overrides static const char* kClassName() { return "SkipListFactory"; } static const char* kNickName() { return "skip_list"; } - virtual const char* Name() const override { return kClassName(); } - virtual const char* NickName() const override { return kNickName(); } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } std::string GetId() const override; // Methods for MemTableRepFactory class overrides using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Allocator*, const SliceTransform*, - Logger* logger) override; + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, + const SliceTransform*, + Logger* logger) override; bool IsInsertConcurrentlySupported() const override { return true; } @@ -381,9 +381,9 @@ class VectorRepFactory : public MemTableRepFactory { // Methods for MemTableRepFactory class overrides using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Allocator*, const SliceTransform*, - Logger* logger) override; + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, + const SliceTransform*, + Logger* logger) override; }; // This class contains a fixed array of buckets, each @@ -392,7 +392,7 @@ class VectorRepFactory : public MemTableRepFactory { // skiplist_height: the max height of the skiplist // skiplist_branching_factor: probabilistic size ratio between adjacent // link lists in the skiplist -extern MemTableRepFactory* NewHashSkipListRepFactory( +MemTableRepFactory* NewHashSkipListRepFactory( size_t bucket_count = 1000000, int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4); @@ -412,7 +412,7 @@ extern MemTableRepFactory* NewHashSkipListRepFactory( // entries when flushing. // @threshold_use_skiplist: a bucket switches to skip list if number of // entries exceed this parameter. -extern MemTableRepFactory* NewHashLinkListRepFactory( +MemTableRepFactory* NewHashLinkListRepFactory( size_t bucket_count = 50000, size_t huge_page_tlb_size = 0, int bucket_entries_logging_threshold = 4096, bool if_log_bucket_dist_when_flash = true, diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index 6be9e3962b2..387e5345c00 100755 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -278,7 +278,7 @@ class MergeOperator : public Customizable { // TODO: the name is currently not stored persistently and thus // no checking is enforced. Client is responsible for providing // consistent MergeOperator between DB opens. - virtual const char* Name() const override = 0; + const char* Name() const override = 0; // Determines whether the PartialMerge can be called with just a single // merge operand. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 7eb88b157e9..d0a2d11ae76 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -443,6 +443,17 @@ struct CompactionServiceJobInfo { priority(priority_) {} }; +struct CompactionServiceScheduleResponse { + std::string scheduled_job_id; // Generated outside of primary host, unique + // across different DBs and sessions + CompactionServiceJobStatus status; + CompactionServiceScheduleResponse(std::string scheduled_job_id_, + CompactionServiceJobStatus status_) + : scheduled_job_id(scheduled_job_id_), status(status_) {} + explicit CompactionServiceScheduleResponse(CompactionServiceJobStatus status_) + : status(status_) {} +}; + // Exceptions MUST NOT propagate out of overridden functions into RocksDB, // because RocksDB is not exception-safe. This could cause undefined behavior // including data loss, unreported corruption, deadlocks, and more. @@ -453,6 +464,24 @@ class CompactionService : public Customizable { // Returns the name of this compaction service. const char* Name() const override = 0; + // Schedule compaction to be processed remotely. + virtual CompactionServiceScheduleResponse Schedule( + const CompactionServiceJobInfo& /*info*/, + const std::string& /*compaction_service_input*/) { + CompactionServiceScheduleResponse response( + CompactionServiceJobStatus::kUseLocal); + return response; + } + + // Wait for the scheduled compaction to finish from the remote worker + virtual CompactionServiceJobStatus Wait( + const std::string& /*scheduled_job_id*/, std::string* /*result*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + // Deprecated. Please implement Schedule() and Wait() API to handle remote + // compaction + // Start the remote compaction with `compaction_service_input`, which can be // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the // information the user might want to know, which includes `job_id`. @@ -577,6 +606,8 @@ struct DBOptions { // Default: true bool paranoid_checks = true; + // DEPRECATED: This option might be removed in a future release. + // // If true, during memtable flush, RocksDB will validate total entries // read in flush, and compare with counter inserted into it. // @@ -587,6 +618,8 @@ struct DBOptions { // Default: true bool flush_verify_memtable_count = true; + // DEPRECATED: This option might be removed in a future release. + // // If true, during compaction, RocksDB will count the number of entries // read and compare it against the number of entries in the compaction // input files. This is intended to add protection against corruption @@ -1025,15 +1058,6 @@ struct DBOptions { // Default: null std::shared_ptr write_buffer_manager = nullptr; - // DEPRECATED - // This flag has no effect on the behavior of compaction and we plan to delete - // it in the future. - // Specify the file access pattern once a compaction is started. - // It will be applied to all input files of a compaction. - // Default: NORMAL - enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED }; - AccessHint access_hint_on_compaction_start = NORMAL; - // If non-zero, we perform bigger reads when doing compaction. If you're // running RocksDB on spinning disks, you should set this to at least 2MB. // That way RocksDB's compaction is doing sequential instead of random reads. @@ -1281,6 +1305,8 @@ struct DBOptions { // currently. WalFilter* wal_filter = nullptr; + // DEPRECATED: This option might be removed in a future release. + // // If true, then DB::Open, CreateColumnFamily, DropColumnFamily, and // SetOptions will fail if options file is not properly persisted. // @@ -1341,10 +1367,11 @@ struct DBOptions { // file. bool manual_wal_flush = false; - // This feature is WORK IN PROGRESS - // If enabled WAL records will be compressed before they are written. - // Only zstd is supported. Compressed WAL records will be read in supported - // versions regardless of the wal_compression settings. + // If enabled WAL records will be compressed before they are written. Only + // ZSTD (= kZSTD) is supported (until streaming support is adapted for other + // compression types). Compressed WAL records will be read in supported + // versions (>= RocksDB 7.4.0 for ZSTD) regardless of this setting when + // the WAL is read. CompressionType wal_compression = kNoCompression; // If true, RocksDB supports flushing multiple column families and committing @@ -1408,6 +1435,13 @@ struct DBOptions { // is like applying WALRecoveryMode::kPointInTimeRecovery to each column // family rather than just the WAL. // + // The behavior changes in the presence of "AtomicGroup"s in the MANIFEST, + // which is currently only the case when `atomic_flush == true`. In that + // case, all pre-existing CFs must recover the atomic group in order for + // that group to be applied in an all-or-nothing manner. This means that + // unused/inactive CF(s) with invalid filesystem state can block recovery of + // all other CFs at an atomic group. + // // Best-efforts recovery (BER) is specifically designed to recover a DB with // files that are missing or truncated to some smaller size, such as the // result of an incomplete DB "physical" (FileSystem) copy. BER can also @@ -1425,8 +1459,6 @@ struct DBOptions { // setting. BER does require at least one valid MANIFEST to recover to a // non-trivial DB state, unlike `ldb repair`. // - // Currently, best_efforts_recovery=true is not compatible with atomic flush. - // // Default: false bool best_efforts_recovery = false; @@ -1515,6 +1547,8 @@ struct DBOptions { // Status: Experimental. std::shared_ptr replication_epoch_extractor = nullptr; + // DEPRECATED: This option might be removed in a future release. + // // If set to false, when compaction or flush sees a SingleDelete followed by // a Delete for the same user key, compaction job will not fail. // Otherwise, compaction job will fail. @@ -1863,23 +1897,17 @@ struct ReadOptions { // Default: empty (every table will be scanned) std::function table_filter; - // Experimental - // // If auto_readahead_size is set to true, it will auto tune the readahead_size // during scans internally. // For this feature to enabled, iterate_upper_bound must also be specified. // // NOTE: - Recommended for forward Scans only. - // - In case of backward scans like Prev or SeekForPrev, the - // cost of these backward operations might increase and affect the - // performace. So this option should not be enabled if workload - // contains backward scans. // - If there is a backward scans, this option will be - // disabled internally and won't be reset if forward scan is done - // again. + // disabled internally and won't be enabled again if the forward scan + // is issued again. // - // Default: false - bool auto_readahead_size = false; + // Default: true + bool auto_readahead_size = true; // *** END options only relevant to iterators or scans *** @@ -1913,7 +1941,7 @@ struct WriteOptions { // system call followed by "fdatasync()". // // Default: false - bool sync; + bool sync = false; // If true, writes will not first go to the write ahead log, // and the write may get lost after a crash. The backup engine @@ -1921,18 +1949,18 @@ struct WriteOptions { // you disable write-ahead logs, you must create backups with // flush_before_backup=true to avoid losing unflushed memtable data. // Default: false - bool disableWAL; + bool disableWAL = false; // If true and if user is trying to write to column families that don't exist // (they were dropped), ignore the write (don't return an error). If there // are multiple writes in a WriteBatch, other writes will succeed. // Default: false - bool ignore_missing_column_families; + bool ignore_missing_column_families = false; // If true and we need to wait or sleep for the write request, fails // immediately with Status::Incomplete(). // Default: false - bool no_slowdown; + bool no_slowdown = false; // If true, this write request is of lower priority if compaction is // behind. In this case, no_slowdown = true, the request will be canceled @@ -1941,10 +1969,10 @@ struct WriteOptions { // it introduces minimum impacts to high priority writes. // // Default: false - bool low_pri; + bool low_pri = false; // See comments for PreReleaseCallback - PreReleaseCallback* pre_release_callback; + PreReleaseCallback* pre_release_callback = nullptr; // If true, this writebatch will maintain the last insert positions of each // memtable as hints in concurrent write. It can improve write performance @@ -1953,7 +1981,7 @@ struct WriteOptions { // option will be ignored. // // Default: false - bool memtable_insert_hint_per_batch; + bool memtable_insert_hint_per_batch = false; // For writes associated with this option, charge the internal rate // limiter (see `DBOptions::rate_limiter`) at the specified priority. The @@ -1968,25 +1996,25 @@ struct WriteOptions { // due to implementation constraints. // // Default: `Env::IO_TOTAL` - Env::IOPriority rate_limiter_priority; + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; // `protection_bytes_per_key` is the number of bytes used to store // protection information for each key entry. Currently supported values are // zero (disabled) and eight. // // Default: zero (disabled). - size_t protection_bytes_per_key; - - WriteOptions() - : sync(false), - disableWAL(false), - ignore_missing_column_families(false), - no_slowdown(false), - low_pri(false), - pre_release_callback(nullptr), - memtable_insert_hint_per_batch(false), - rate_limiter_priority(Env::IO_TOTAL), - protection_bytes_per_key(0) {} + size_t protection_bytes_per_key = 0; + + // For RocksDB internal use only + // + // Default: Env::IOActivity::kUnknown. + Env::IOActivity io_activity = Env::IOActivity::kUnknown; + + WriteOptions() {} + explicit WriteOptions(Env::IOActivity _io_activity); + explicit WriteOptions( + Env::IOPriority _rate_limiter_priority, + Env::IOActivity _io_activity = Env::IOActivity::kUnknown); }; // Options that control flush operations @@ -2004,9 +2032,9 @@ struct FlushOptions { }; // Create a Logger from provided DBOptions -extern Status CreateLoggerFromOptions(const std::string& dbname, - const DBOptions& options, - std::shared_ptr* logger); +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr* logger); // CompactionOptions are used in CompactFiles() call. struct CompactionOptions { @@ -2185,13 +2213,15 @@ struct IngestExternalFileOptions { // external file. bool unsafe_disable_sync = false; - // Set to TRUE if user wants file to be ingested to the bottommost level. An + // Set to TRUE if user wants file to be ingested to the last level. An // error of Status::TryAgain() will be returned if a file cannot fit in the - // bottommost level when calling + // last level when calling // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear - // the bottommost level in the overlapping range before re-attempt. + // the last level in the overlapping range before re-attempt. // // ingest_behind takes precedence over fail_if_not_bottommost_level. + // + // XXX: "bottommost" is obsolete/confusing terminology to refer to last level bool fail_if_not_bottommost_level = false; }; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 216dd07d9be..b784765b608 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -83,6 +83,11 @@ struct PerfContextBase { uint64_t filter_block_read_count; // total number of filter block reads uint64_t compression_dict_block_read_count; // total number of compression // dictionary block reads + // Cumulative size of blocks found in block cache + uint64_t block_cache_index_read_byte; + uint64_t block_cache_filter_read_byte; + uint64_t block_cache_compression_dict_read_byte; + uint64_t block_cache_read_byte; // RocksDB-Cloud contribution begin diff --git a/include/rocksdb/perf_level.h b/include/rocksdb/perf_level.h index e7dded0e321..abd40faac25 100644 --- a/include/rocksdb/perf_level.h +++ b/include/rocksdb/perf_level.h @@ -18,13 +18,16 @@ enum PerfLevel : unsigned char { kUninitialized = 0, // unknown setting kDisable = 1, // disable perf stats kEnableCount = 2, // enable only count stats - kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time + kEnableWait = 3, // measure time spent by user threads + // blocked in RocksDB, and not external + // resources such as mutexes and IO + kEnableTimeExceptForMutex = 4, // Other than count stats, also enable time // stats except for mutexes // Other than time, also measure CPU time counters. Still don't measure // time (neither wall time nor CPU time) for mutexes. - kEnableTimeAndCPUTimeExceptForMutex = 4, - kEnableTime = 5, // enable count and time stats - kOutOfBounds = 6 // N.B. Must always be the last value! + kEnableTimeAndCPUTimeExceptForMutex = 5, + kEnableTime = 6, // enable count and time stats + kOutOfBounds = 7 // N.B. Must always be the last value! }; // set the perf stats level for current thread diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h index 3515b1e953b..51383ba20ad 100644 --- a/include/rocksdb/rate_limiter.h +++ b/include/rocksdb/rate_limiter.h @@ -26,9 +26,9 @@ class RateLimiter { }; enum class Mode { - kReadsOnly, - kWritesOnly, - kAllIo, + kReadsOnly = 0, + kWritesOnly = 1, + kAllIo = 2, }; // For API compatibility, default to rate-limiting writes only. @@ -41,9 +41,10 @@ class RateLimiter { virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0; // This API allows user to dynamically change the max bytes can be granted in - // a single refill period (i.e, burst) + // a single call to `Request()`. Zero is a special value meaning the number of + // bytes per refill. // - // REQUIRED: single_burst_bytes > 0. Otherwise `Status::InvalidArgument` will + // REQUIRED: single_burst_bytes >= 0. Otherwise `Status::InvalidArgument` will // be returned. virtual Status SetSingleBurstBytes(int64_t /* single_burst_bytes */) { return Status::NotSupported(); @@ -93,7 +94,7 @@ class RateLimiter { Env::IOPriority io_priority, Statistics* stats, RateLimiter::OpType op_type); - // Max bytes can be granted in a single burst + // Max bytes can be granted in a single call to `Request()`. virtual int64_t GetSingleBurstBytes() const = 0; // Total bytes that go through rate limiter @@ -143,11 +144,11 @@ class RateLimiter { // time. It controls the total write rate of compaction and flush in bytes per // second. Currently, RocksDB does not enforce rate limit for anything other // than flush and compaction, e.g. write to WAL. -// @refill_period_us: this controls how often tokens are refilled. For example, -// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to -// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to -// burstier writes while smaller value introduces more CPU overhead. -// The default should work for most cases. +// @refill_period_us: This controls how often tokens are refilled. For example, +// when `rate_bytes_per_sec` is set to 10MB/s and +// `refill_period_us` is set to 100ms, then 1MB is refilled +// every 100ms internally. Larger values can lead to sporadic +// delays while smaller values introduce more CPU overhead. // @fairness: RateLimiter accepts high-pri requests and low-pri requests. // A low-pri request is usually blocked in favor of hi-pri request. Currently, // RocksDB assigns low-pri to request from compaction and high-pri to request @@ -159,10 +160,13 @@ class RateLimiter { // @auto_tuned: Enables dynamic adjustment of rate limit within the range // `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to // the recent demand for background I/O. -extern RateLimiter* NewGenericRateLimiter( +// @single_burst_bytes: The maximum number of bytes that can be granted in a +// single call to `Request()`. Zero is a special value +// meaning the number of bytes per refill. +RateLimiter* NewGenericRateLimiter( int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000, int32_t fairness = 10, RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly, - bool auto_tuned = false); + bool auto_tuned = false, int64_t single_burst_bytes = 0); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h index 49792ca67a5..e8644c45469 100644 --- a/include/rocksdb/secondary_cache.h +++ b/include/rocksdb/secondary_cache.h @@ -114,7 +114,7 @@ class SecondaryCache : public Customizable { virtual std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& kept_in_sec_cache) = 0; + Statistics* stats, bool& kept_in_sec_cache) = 0; // Indicate whether a handle can be erased in this secondary cache. [[nodiscard]] virtual bool SupportForceErase() const = 0; @@ -160,51 +160,49 @@ class SecondaryCacheWrapper : public SecondaryCache { explicit SecondaryCacheWrapper(std::shared_ptr target) : target_(std::move(target)) {} - virtual Status Insert(const Slice& key, Cache::ObjectPtr obj, - const Cache::CacheItemHelper* helper, - bool force_insert) override { + Status Insert(const Slice& key, Cache::ObjectPtr obj, + const Cache::CacheItemHelper* helper, + bool force_insert) override { return target()->Insert(key, obj, helper, force_insert); } - virtual Status InsertSaved( - const Slice& key, const Slice& saved, - CompressionType type = CompressionType::kNoCompression, - CacheTier source = CacheTier::kVolatileTier) override { + Status InsertSaved(const Slice& key, const Slice& saved, + CompressionType type = CompressionType::kNoCompression, + CacheTier source = CacheTier::kVolatileTier) override { return target()->InsertSaved(key, saved, type, source); } - virtual std::unique_ptr Lookup( + std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& kept_in_sec_cache) override { + Statistics* stats, bool& kept_in_sec_cache) override { return target()->Lookup(key, helper, create_context, wait, advise_erase, - kept_in_sec_cache); + stats, kept_in_sec_cache); } - virtual bool SupportForceErase() const override { + bool SupportForceErase() const override { return target()->SupportForceErase(); } - virtual void Erase(const Slice& key) override { target()->Erase(key); } + void Erase(const Slice& key) override { target()->Erase(key); } - virtual void WaitAll( - std::vector handles) override { + void WaitAll(std::vector handles) override { target()->WaitAll(handles); } - virtual Status SetCapacity(size_t capacity) override { + Status SetCapacity(size_t capacity) override { return target()->SetCapacity(capacity); } - virtual Status GetCapacity(size_t& capacity) override { + Status GetCapacity(size_t& capacity) override { return target()->GetCapacity(capacity); } - virtual Status Deflate(size_t decrease) override { + Status Deflate(size_t decrease) override { return target()->Deflate(decrease); } - virtual Status Inflate(size_t increase) override { + Status Inflate(size_t increase) override { return target()->Inflate(increase); } diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 8909b9c5394..166cbc94b94 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -34,10 +34,10 @@ struct ConfigOptions; // including data loss, unreported corruption, deadlocks, and more. class SliceTransform : public Customizable { public: - virtual ~SliceTransform(){}; + virtual ~SliceTransform(){} // Return the name of this transformation. - virtual const char* Name() const override = 0; + const char* Name() const override = 0; static const char* Type() { return "SliceTransform"; } // Creates and configures a new SliceTransform from the input options and id. @@ -123,13 +123,13 @@ class SliceTransform : public Customizable { // The prefix is the first `prefix_len` bytes of the key, and keys shorter // then `prefix_len` are not InDomain. -extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); // The prefix is the first min(length(key),`cap_len`) bytes of the key, and // all keys are InDomain. -extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); +const SliceTransform* NewCappedPrefixTransform(size_t cap_len); // Prefix is equal to key. All keys are InDomain. -extern const SliceTransform* NewNoopTransform(); +const SliceTransform* NewNoopTransform(); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h index b4e5a9bafa4..758207d784e 100644 --- a/include/rocksdb/sst_file_manager.h +++ b/include/rocksdb/sst_file_manager.h @@ -117,17 +117,19 @@ class SstFileManager { // `rate_bytes_per_sec` will be appreciated. NOTE that with this option, // files already renamed as a trash may be partial, so users should not // directly recover them without checking. -extern SstFileManager* NewSstFileManager( - Env* env, std::shared_ptr fs, - std::shared_ptr info_log = nullptr, - const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0, - bool delete_existing_trash = true, Status* status = nullptr, - double max_trash_db_ratio = 0.25, - uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); +SstFileManager* NewSstFileManager(Env* env, std::shared_ptr fs, + std::shared_ptr info_log = nullptr, + const std::string& trash_dir = "", + int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, + Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * + 1024); // Same as above, but takes a pointer to a legacy Env object, instead of // Env and FileSystem objects -extern SstFileManager* NewSstFileManager( +SstFileManager* NewSstFileManager( Env* env, std::shared_ptr info_log = nullptr, std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, bool delete_existing_trash = true, Status* status = nullptr, diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h index 026ae66d036..4e5cda130a2 100644 --- a/include/rocksdb/sst_file_reader.h +++ b/include/rocksdb/sst_file_reader.h @@ -32,14 +32,20 @@ class SstFileReader { std::shared_ptr GetTableProperties() const; // Verifies whether there is corruption in this table. + // For the default BlockBasedTable, this will verify the block + // checksum of each block. Status VerifyChecksum(const ReadOptions& /*read_options*/); + // TODO: plumb Env::IOActivity, Env::IOPriority Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } + // Verify that the number of entries in the table matches table property. + // A Corruption status is returned if they do not match. + Status VerifyNumEntries(const ReadOptions& /*read_options*/); + private: struct Rep; std::unique_ptr rep_; }; } // namespace ROCKSDB_NAMESPACE - diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index 04d8bd0b8bc..1a6a8dd9b47 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -9,6 +9,7 @@ #include #include +#include "advanced_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" @@ -77,6 +78,8 @@ struct ExternalSstFileInfo { // SstFileWriter is used to create sst files that can be added to database later // All keys in files generated by SstFileWriter will have sequence number = 0. +// +// This class is NOT thread-safe. class SstFileWriter { public: // User can pass `column_family` to specify that the generated file will @@ -115,7 +118,8 @@ class SstFileWriter { ~SstFileWriter(); // Prepare SstFileWriter to write into file located at "file_path". - Status Open(const std::string& file_path); + Status Open(const std::string& file_path, + Temperature temp = Temperature::kUnknown); // Add a Put key with value to currently opened file (deprecated) // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) @@ -133,6 +137,8 @@ class SstFileWriter { // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) // key according to the comparator. // REQUIRES: timestamp's size is equal to what is expected by the comparator. + // When Options.persist_user_defined_timestamps is set to false, only the + // minimum timestamp is accepted, and it will not be persisted. Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); // Add a PutEntity (key with the wide-column entity defined by "columns") to @@ -155,6 +161,8 @@ class SstFileWriter { // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) // key according to the comparator. // REQUIRES: timestamp's size is equal to what is expected by the comparator. + // When Options.persist_user_defined_timestamps is set to false, only the + // minimum timestamp is accepted, and it will not be persisted. Status Delete(const Slice& user_key, const Slice& timestamp); // Add a range deletion tombstone to currently opened file. Such a range @@ -180,6 +188,8 @@ class SstFileWriter { // REQUIRES: begin_key and end_key are user keys without timestamp. // REQUIRES: The comparator orders `begin_key` at or before `end_key` // REQUIRES: timestamp's size is equal to what is expected by the comparator. + // When Options.persist_user_defined_timestamps is set to false, only the + // minimum timestamp is accepted, and it will not be persisted. Status DeleteRange(const Slice& begin_key, const Slice& end_key, const Slice& timestamp); diff --git a/include/rocksdb/sst_partitioner.h b/include/rocksdb/sst_partitioner.h index 3af8e949297..21c906d1225 100644 --- a/include/rocksdb/sst_partitioner.h +++ b/include/rocksdb/sst_partitioner.h @@ -104,7 +104,7 @@ class SstPartitionerFixedPrefix : public SstPartitioner { public: explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {} - virtual ~SstPartitionerFixedPrefix() override {} + ~SstPartitionerFixedPrefix() override {} const char* Name() const override { return "SstPartitionerFixedPrefix"; } @@ -136,7 +136,7 @@ class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory { size_t len_; }; -extern std::shared_ptr -NewSstPartitionerFixedPrefixFactory(size_t prefix_len); +std::shared_ptr NewSstPartitionerFixedPrefixFactory( + size_t prefix_len); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index ecddf5c7a94..47bf8445fc9 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -72,6 +72,42 @@ enum Tickers : uint32_t { // # of bytes written into cache. BLOCK_CACHE_BYTES_WRITE, + BLOCK_CACHE_COMPRESSION_DICT_MISS, + BLOCK_CACHE_COMPRESSION_DICT_HIT, + BLOCK_CACHE_COMPRESSION_DICT_ADD, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + + // # of blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD + BLOCK_CACHE_ADD_REDUNDANT, + // # of index blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD + BLOCK_CACHE_INDEX_ADD_REDUNDANT, + // # of filter blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD + BLOCK_CACHE_FILTER_ADD_REDUNDANT, + // # of data blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD + BLOCK_CACHE_DATA_ADD_REDUNDANT, + // # of dict blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT + // <= BLOCK_CACHE_COMPRESSION_DICT_ADD + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + + // Secondary cache statistics + SECONDARY_CACHE_HITS, + + // Fine grained secondary cache stats + SECONDARY_CACHE_FILTER_HITS, + SECONDARY_CACHE_INDEX_HITS, + SECONDARY_CACHE_DATA_HITS, + + // Compressed secondary cache related stats + COMPRESSED_SECONDARY_CACHE_DUMMY_HITS, + COMPRESSED_SECONDARY_CACHE_HITS, + COMPRESSED_SECONDARY_CACHE_PROMOTIONS, + COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS, + // # of times bloom filter has avoided file reads, i.e., negatives. BLOOM_FILTER_USEFUL, // # of times bloom FullFilter has not avoided the reads. @@ -79,6 +115,16 @@ enum Tickers : uint32_t { // # of times bloom FullFilter has not avoided the reads and data actually // exist. BLOOM_FILTER_FULL_TRUE_POSITIVE, + // Prefix filter stats when used for point lookups (Get / MultiGet). + // (For prefix filter stats on iterators, see *_LEVEL_SEEK_*.) + // Checked: filter was queried + BLOOM_FILTER_PREFIX_CHECKED, + // Useful: filter returned false so prevented accessing data+index blocks + BLOOM_FILTER_PREFIX_USEFUL, + // True positive: found a key matching the point query. When another key + // with the same prefix matches, it is considered a false positive by + // these statistics even though the filter returned a true positive. + BLOOM_FILTER_PREFIX_TRUE_POSITIVE, // # persistent cache hit PERSISTENT_CACHE_HIT, @@ -142,6 +188,15 @@ enum Tickers : uint32_t { // The number of uncompressed bytes read from an iterator. // Includes size of key and value. ITER_BYTES_READ, + // Number of internal keys skipped by Iterator + NUMBER_ITER_SKIP, + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + NO_ITERATOR_CREATED, // number of iterators created + NO_ITERATOR_DELETED, // number of iterators deleted + NO_FILE_OPENS, NO_FILE_ERRORS, // Writer has to wait for compaction or flush to finish. @@ -154,24 +209,13 @@ enum Tickers : uint32_t { NUMBER_MULTIGET_CALLS, NUMBER_MULTIGET_KEYS_READ, NUMBER_MULTIGET_BYTES_READ, + // Number of keys actually found in MultiGet calls (vs number requested by + // caller) + // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller + NUMBER_MULTIGET_KEYS_FOUND, NUMBER_MERGE_FAILURES, - // Prefix filter stats when used for point lookups (Get / MultiGet). - // (For prefix filter stats on iterators, see *_LEVEL_SEEK_*.) - // Checked: filter was queried - BLOOM_FILTER_PREFIX_CHECKED, - // Useful: filter returned false so prevented accessing data+index blocks - BLOOM_FILTER_PREFIX_USEFUL, - // True positive: found a key matching the point query. When another key - // with the same prefix matches, it is considered a false positive by - // these statistics even though the filter returned a true positive. - BLOOM_FILTER_PREFIX_TRUE_POSITIVE, - - // Number of times we had to reseek inside an iteration to skip - // over large number of keys with same userkey. - NUMBER_OF_RESEEKS_IN_ITERATION, - // Record the number of calls to GetUpdatesSince. Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, @@ -206,8 +250,35 @@ enum Tickers : uint32_t { NUMBER_BLOCK_COMPRESSED, NUMBER_BLOCK_DECOMPRESSED, - // DEPRECATED / unused (see NUMBER_BLOCK_COMPRESSION_*) - NUMBER_BLOCK_NOT_COMPRESSED, + // Number of input bytes (uncompressed) to compression for SST blocks that + // are stored compressed. + BYTES_COMPRESSED_FROM, + // Number of output bytes (compressed) from compression for SST blocks that + // are stored compressed. + BYTES_COMPRESSED_TO, + // Number of uncompressed bytes for SST blocks that are stored uncompressed + // because compression type is kNoCompression, or some error case caused + // compression not to run or produce an output. Index blocks are only counted + // if enable_index_compression is true. + BYTES_COMPRESSION_BYPASSED, + // Number of input bytes (uncompressed) to compression for SST blocks that + // are stored uncompressed because the compression result was rejected, + // either because the ratio was not acceptable (see + // CompressionOptions::max_compressed_bytes_per_kb) or found invalid by the + // `verify_compression` option. + BYTES_COMPRESSION_REJECTED, + + // Like BYTES_COMPRESSION_BYPASSED but counting number of blocks + NUMBER_BLOCK_COMPRESSION_BYPASSED, + // Like BYTES_COMPRESSION_REJECTED but counting number of blocks + NUMBER_BLOCK_COMPRESSION_REJECTED, + + // Number of input bytes (compressed) to decompression in reading compressed + // SST blocks from storage. + BYTES_DECOMPRESSED_FROM, + // Number of output bytes (uncompressed) from decompression in reading + // compressed SST blocks from storage. + BYTES_DECOMPRESSED_TO, // Tickers that record cumulative time. MERGE_OPERATION_TOTAL_TIME, @@ -229,9 +300,6 @@ enum Tickers : uint32_t { // Number of refill intervals where rate limiter's bytes are fully consumed. NUMBER_RATE_LIMITER_DRAINS, - // Number of internal keys skipped by Iterator - NUMBER_ITER_SKIP, - // BlobDB specific stats // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_PUT, @@ -310,6 +378,20 @@ enum Tickers : uint32_t { // applicable to legacy BlobDB. BLOB_DB_FIFO_BYTES_EVICTED, + // Integrated BlobDB specific stats + // # of times cache miss when accessing blob from blob cache. + BLOB_DB_CACHE_MISS, + // # of times cache hit when accessing blob from blob cache. + BLOB_DB_CACHE_HIT, + // # of data blocks added to blob cache. + BLOB_DB_CACHE_ADD, + // # of failures when adding blobs to blob cache. + BLOB_DB_CACHE_ADD_FAILURES, + // # of bytes read from blob cache. + BLOB_DB_CACHE_BYTES_READ, + // # of bytes written into blob cache. + BLOB_DB_CACHE_BYTES_WRITE, + // These counters indicate a performance issue in WritePrepared transactions. // We should not seem them ticking them much. // # of times prepare_mutex_ is acquired in the fast path. @@ -323,36 +405,6 @@ enum Tickers : uint32_t { // # of times ::Get returned TryAgain due to expired snapshot seq TXN_GET_TRY_AGAIN, - // Number of keys actually found in MultiGet calls (vs number requested by - // caller) - // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller - NUMBER_MULTIGET_KEYS_FOUND, - - NO_ITERATOR_CREATED, // number of iterators created - NO_ITERATOR_DELETED, // number of iterators deleted - - BLOCK_CACHE_COMPRESSION_DICT_MISS, - BLOCK_CACHE_COMPRESSION_DICT_HIT, - BLOCK_CACHE_COMPRESSION_DICT_ADD, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, - - // # of blocks redundantly inserted into block cache. - // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD - BLOCK_CACHE_ADD_REDUNDANT, - // # of index blocks redundantly inserted into block cache. - // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD - BLOCK_CACHE_INDEX_ADD_REDUNDANT, - // # of filter blocks redundantly inserted into block cache. - // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD - BLOCK_CACHE_FILTER_ADD_REDUNDANT, - // # of data blocks redundantly inserted into block cache. - // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD - BLOCK_CACHE_DATA_ADD_REDUNDANT, - // # of dict blocks redundantly inserted into block cache. - // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT - // <= BLOCK_CACHE_COMPRESSION_DICT_ADD - BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, - // # of files marked as trash by sst file manager and will be deleted // later by background thread. FILES_MARKED_TRASH, @@ -362,16 +414,11 @@ enum Tickers : uint32_t { // scheduler. FILES_DELETED_IMMEDIATELY, - // The counters for error handler, not that, bg_io_error is the subset of + // The counters for error handler, note that, bg_io_error is the subset of // bg_error and bg_retryable_io_error is the subset of bg_io_error. - // The misspelled versions are deprecated and only kept for compatibility. - // TODO: remove the misspelled tickers in the next major release. ERROR_HANDLER_BG_ERROR_COUNT, - ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED, ERROR_HANDLER_BG_IO_ERROR_COUNT, - ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED, ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, - ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED, ERROR_HANDLER_AUTORESUME_COUNT, ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, @@ -382,9 +429,6 @@ enum Tickers : uint32_t { // Outdated bytes of data present on memtable at flush time. MEMTABLE_GARBAGE_BYTES_AT_FLUSH, - // Secondary cache statistics - SECONDARY_CACHE_HITS, - // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. VERIFY_CHECKSUM_READ_BYTES, @@ -445,30 +489,11 @@ enum Tickers : uint32_t { MULTIGET_COROUTINE_COUNT, - // Integrated BlobDB specific stats - // # of times cache miss when accessing blob from blob cache. - BLOB_DB_CACHE_MISS, - // # of times cache hit when accessing blob from blob cache. - BLOB_DB_CACHE_HIT, - // # of data blocks added to blob cache. - BLOB_DB_CACHE_ADD, - // # of failures when adding blobs to blob cache. - BLOB_DB_CACHE_ADD_FAILURES, - // # of bytes read from blob cache. - BLOB_DB_CACHE_BYTES_READ, - // # of bytes written into blob cache. - BLOB_DB_CACHE_BYTES_WRITE, - // Time spent in the ReadAsync file system call READ_ASYNC_MICROS, // Number of errors returned to the async read callback ASYNC_READ_ERROR_COUNT, - // Fine grained secondary cache stats - SECONDARY_CACHE_FILTER_HITS, - SECONDARY_CACHE_INDEX_HITS, - SECONDARY_CACHE_DATA_HITS, - // Number of lookup into the prefetched tail (see // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) // that can't find its data for table open @@ -484,36 +509,6 @@ enum Tickers : uint32_t { // # of times timestamps can successfully help skip the table access TIMESTAMP_FILTER_TABLE_FILTERED, - // Number of input bytes (uncompressed) to compression for SST blocks that - // are stored compressed. - BYTES_COMPRESSED_FROM, - // Number of output bytes (compressed) from compression for SST blocks that - // are stored compressed. - BYTES_COMPRESSED_TO, - // Number of uncompressed bytes for SST blocks that are stored uncompressed - // because compression type is kNoCompression, or some error case caused - // compression not to run or produce an output. Index blocks are only counted - // if enable_index_compression is true. - BYTES_COMPRESSION_BYPASSED, - // Number of input bytes (uncompressed) to compression for SST blocks that - // are stored uncompressed because the compression result was rejected, - // either because the ratio was not acceptable (see - // CompressionOptions::max_compressed_bytes_per_kb) or found invalid by the - // `verify_compression` option. - BYTES_COMPRESSION_REJECTED, - - // Like BYTES_COMPRESSION_BYPASSED but counting number of blocks - NUMBER_BLOCK_COMPRESSION_BYPASSED, - // Like BYTES_COMPRESSION_REJECTED but counting number of blocks - NUMBER_BLOCK_COMPRESSION_REJECTED, - - // Number of input bytes (compressed) to decompression in reading compressed - // SST blocks from storage. - BYTES_DECOMPRESSED_FROM, - // Number of output bytes (uncompressed) from decompression in reading - // compressed SST blocks from storage. - BYTES_DECOMPRESSED_TO, - // Number of times readahead is trimmed during scans when // ReadOptions.auto_readahead_size is set. READAHEAD_TRIMMED, @@ -531,6 +526,9 @@ enum Tickers : uint32_t { // Number of FS reads avoided due to scan prefetching PREFETCH_HITS, + // Footer corruption detected when opening an SST file for reading + SST_FOOTER_CORRUPTION_COUNT, + TICKER_ENUM_MAX }; @@ -583,6 +581,14 @@ enum Histograms : uint32_t { FILE_READ_VERIFY_DB_CHECKSUM_MICROS, FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS, + // Time spent in writing SST files + SST_WRITE_MICROS, + // Time spent in writing SST table (currently only block-based table) or blob + // file for flush, compaction or db open + FILE_WRITE_FLUSH_MICROS, + FILE_WRITE_COMPACTION_MICROS, + FILE_WRITE_DB_OPEN_MICROS, + // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, // Value size distribution in each operation @@ -590,8 +596,6 @@ enum Histograms : uint32_t { BYTES_PER_WRITE, BYTES_PER_MULTIGET, - BYTES_COMPRESSED, // DEPRECATED / unused (see BYTES_COMPRESSED_{FROM,TO}) - BYTES_DECOMPRESSED, // DEPRECATED / unused (see BYTES_DECOMPRESSED_{FROM,TO}) COMPRESSION_TIMES_NANOS, DECOMPRESSION_TIMES_NANOS, // Number of merge operands passed to the merge operator in user read @@ -631,11 +635,15 @@ enum Histograms : uint32_t { FLUSH_TIME, SST_BATCH_SIZE, + // Number of IOs issued in parallel in a MultiGet batch + MULTIGET_IO_BATCH_SIZE, // MultiGet stats logged per level // Num of index and filter blocks read from file system per level. NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, // Num of sst files read from file system per level. NUM_SST_READ_PER_LEVEL, + // Number of levels requiring IO for MultiGet + NUM_LEVEL_READ_PER_MULTIGET, // Error handler statistics ERROR_HANDLER_AUTORESUME_RETRY_COUNT, @@ -647,12 +655,6 @@ enum Histograms : uint32_t { // Number of prefetched bytes discarded by RocksDB. PREFETCHED_BYTES_DISCARDED, - // Number of IOs issued in parallel in a MultiGet batch - MULTIGET_IO_BATCH_SIZE, - - // Number of levels requiring IO for MultiGet - NUM_LEVEL_READ_PER_MULTIGET, - // Wait time for aborting async read in FilePrefetchBuffer destructor ASYNC_PREFETCH_ABORT_MICROS, diff --git a/include/rocksdb/system_clock.h b/include/rocksdb/system_clock.h index c4cfcecb552..ddff0fd2a22 100644 --- a/include/rocksdb/system_clock.h +++ b/include/rocksdb/system_clock.h @@ -36,7 +36,7 @@ class SystemClock : public Customizable { const std::string& value, std::shared_ptr* result); // The name of this system clock - virtual const char* Name() const override = 0; + const char* Name() const override = 0; // The name/nickname for the Default SystemClock. This name can be used // to determine if the clock is the default one. @@ -100,12 +100,12 @@ class SystemClockWrapper : public SystemClock { uint64_t CPUNanos() override { return target_->CPUNanos(); } - virtual void SleepForMicroseconds(int micros) override { + void SleepForMicroseconds(int micros) override { return target_->SleepForMicroseconds(micros); } - virtual bool TimedWait(port::CondVar* cv, - std::chrono::microseconds deadline) override { + bool TimedWait(port::CondVar* cv, + std::chrono::microseconds deadline) override { return target_->TimedWait(cv, deadline); } diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index d19a95fa8e4..b0b276a6396 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -127,7 +127,7 @@ struct CacheUsageOptions { // For advanced user only struct BlockBasedTableOptions { - static const char* kName() { return "BlockTableOptions"; }; + static const char* kName() { return "BlockTableOptions"; } // @flush_block_policy_factory creates the instances of flush block policy. // which provides a configurable way to determine when to flush a block in // the block based tables. If not set, table builder will use the default @@ -518,7 +518,15 @@ struct BlockBasedTableOptions { // 6 -- Modified the file footer and checksum matching so that SST data // misplaced within or between files is as likely to fail checksum // verification as random corruption. Also checksum-protects SST footer. - uint32_t format_version = 5; + // Can be read by RocksDB versions >= 8.6.0. + // + // Using the default setting of format_version is strongly recommended, so + // that available enhancements are adopted eventually and automatically. The + // default setting will only update to the latest after thorough production + // validation and sufficient time and number of releases have elapsed + // (6 months recommended) to ensure a clean downgrade/revert path for users + // who might only upgrade a few times per year. + uint32_t format_version = 6; // Store index blocks on disk in compressed format. Changing this option to // false will avoid the overhead of decompression if index blocks are evicted @@ -674,10 +682,9 @@ struct BlockBasedTablePropertyNames { }; // Create default block based table factory. -extern TableFactory* NewBlockBasedTableFactory( +TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); - enum EncodingType : char { // Always write full keys without any special encoding. kPlain, @@ -705,7 +712,7 @@ struct PlainTablePropertyNames { const uint32_t kPlainTableVariableLength = 0; struct PlainTableOptions { - static const char* kName() { return "PlainTableOptions"; }; + static const char* kName() { return "PlainTableOptions"; } // @user_key_len: plain table has optimization for fix-sized keys, which can // be specified via user_key_len. Alternatively, you can pass // `kPlainTableVariableLength` if your keys have variable @@ -763,7 +770,7 @@ struct PlainTableOptions { // the hash bucket found, a binary search is executed for hash conflicts. // Finally, a linear search is used. -extern TableFactory* NewPlainTableFactory( +TableFactory* NewPlainTableFactory( const PlainTableOptions& options = PlainTableOptions()); struct CuckooTablePropertyNames { @@ -799,7 +806,7 @@ struct CuckooTablePropertyNames { }; struct CuckooTableOptions { - static const char* kName() { return "CuckooTableOptions"; }; + static const char* kName() { return "CuckooTableOptions"; } // Determines the utilization of hash tables. Smaller values // result in larger hash tables with fewer collisions. @@ -830,21 +837,20 @@ struct CuckooTableOptions { }; // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing -extern TableFactory* NewCuckooTableFactory( +TableFactory* NewCuckooTableFactory( const CuckooTableOptions& table_options = CuckooTableOptions()); - class RandomAccessFileReader; // A base class for table factories. class TableFactory : public Customizable { public: - virtual ~TableFactory() override {} + ~TableFactory() override {} - static const char* kBlockCacheOpts() { return "BlockCache"; }; - static const char* kBlockBasedTableName() { return "BlockBasedTable"; }; + static const char* kBlockCacheOpts() { return "BlockCache"; } + static const char* kBlockBasedTableName() { return "BlockBasedTable"; } static const char* kPlainTableName() { return "PlainTable"; } - static const char* kCuckooTableName() { return "CuckooTable"; }; + static const char* kCuckooTableName() { return "CuckooTable"; } // Creates and configures a new TableFactory from the input options and id. static Status CreateFromString(const ConfigOptions& config_options, @@ -923,11 +929,10 @@ class TableFactory : public Customizable { // @plain_table_factory: plain table factory to use. If NULL, use a default one. // @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default // one. -extern TableFactory* NewAdaptiveTableFactory( +TableFactory* NewAdaptiveTableFactory( std::shared_ptr table_factory_to_write = nullptr, std::shared_ptr block_based_table_factory = nullptr, std::shared_ptr plain_table_factory = nullptr, std::shared_ptr cuckoo_table_factory = nullptr); - } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 052df35035b..f444275b96a 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -16,6 +16,8 @@ namespace ROCKSDB_NAMESPACE { +class InternalTblPropColl; + // -- Table Properties // Other than basic table properties, each table may also have the user // collected properties. @@ -138,10 +140,12 @@ class TablePropertiesCollector { // EXPERIMENTAL Return whether the output file should be further compacted virtual bool NeedCompact() const { return false; } + + // For internal use only. + virtual InternalTblPropColl* AsInternal() { return nullptr; } }; -// Constructs TablePropertiesCollector. Internals create a new -// TablePropertiesCollector for each new table +// Constructs TablePropertiesCollector instances for each table file creation. // // Exceptions MUST NOT propagate out of overridden functions into RocksDB, // because RocksDB is not exception-safe. This could cause undefined behavior @@ -163,7 +167,12 @@ class TablePropertiesCollectorFactory : public Customizable { const ConfigOptions& options, const std::string& value, std::shared_ptr* result); - // has to be thread-safe + // To collect properties of a table with the given context, returns + // a new object inheriting from TablePropertiesCollector. The caller + // is responsible for deleting the object returned. Alternatively, + // nullptr may be returned to decline collecting properties for the + // file (and reduce callback overheads). + // MUST be thread-safe. virtual TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) = 0; @@ -345,8 +354,8 @@ struct TableProperties { // DEPRECATED: these properties now belong as TableProperties members. Please // use TableProperties::num_deletions and TableProperties::num_merge_operands, // respectively. -extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); -extern uint64_t GetMergeOperands(const UserCollectedProperties& props, - bool* property_present); +uint64_t GetDeletedKeys(const UserCollectedProperties& props); +uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/threadpool.h b/include/rocksdb/threadpool.h index f1cc5575248..a1f2b85779d 100644 --- a/include/rocksdb/threadpool.h +++ b/include/rocksdb/threadpool.h @@ -62,6 +62,6 @@ class ThreadPool { // NewThreadPool() is a function that could be used to create a ThreadPool // with `num_threads` background threads. -extern ThreadPool* NewThreadPool(int num_threads); +ThreadPool* NewThreadPool(int num_threads); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/trace_record.h b/include/rocksdb/trace_record.h index c00f5cafbe7..8f9c3ee2f0f 100644 --- a/include/rocksdb/trace_record.h +++ b/include/rocksdb/trace_record.h @@ -104,7 +104,7 @@ class WriteQueryTraceRecord : public QueryTraceRecord { WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp); - virtual ~WriteQueryTraceRecord() override; + ~WriteQueryTraceRecord() override; TraceType GetTraceType() const override { return kTraceWrite; } @@ -127,7 +127,7 @@ class GetQueryTraceRecord : public QueryTraceRecord { GetQueryTraceRecord(uint32_t column_family_id, const std::string& key, uint64_t timestamp); - virtual ~GetQueryTraceRecord() override; + ~GetQueryTraceRecord() override; TraceType GetTraceType() const override { return kTraceGet; } @@ -156,7 +156,7 @@ class IteratorQueryTraceRecord : public QueryTraceRecord { IteratorQueryTraceRecord(const std::string& lower_bound, const std::string& upper_bound, uint64_t timestamp); - virtual ~IteratorQueryTraceRecord() override; + ~IteratorQueryTraceRecord() override; // Get the iterator's lower/upper bound. They may be used in ReadOptions to // create an Iterator instance. @@ -193,7 +193,7 @@ class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord { const std::string& upper_bound, uint64_t timestamp); - virtual ~IteratorSeekQueryTraceRecord() override; + ~IteratorSeekQueryTraceRecord() override; // Trace type matches the seek type. TraceType GetTraceType() const override; @@ -227,7 +227,7 @@ class MultiGetQueryTraceRecord : public QueryTraceRecord { const std::vector& keys, uint64_t timestamp); - virtual ~MultiGetQueryTraceRecord() override; + ~MultiGetQueryTraceRecord() override; TraceType GetTraceType() const override { return kTraceMultiGet; } diff --git a/include/rocksdb/trace_record_result.h b/include/rocksdb/trace_record_result.h index 0cd0004a6a7..d5b29a782e6 100644 --- a/include/rocksdb/trace_record_result.h +++ b/include/rocksdb/trace_record_result.h @@ -86,12 +86,12 @@ class StatusOnlyTraceExecutionResult : public TraceExecutionResult { StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type); - virtual ~StatusOnlyTraceExecutionResult() override = default; + ~StatusOnlyTraceExecutionResult() override = default; // Return value of DB::Write(), etc. virtual const Status& GetStatus() const; - virtual Status Accept(Handler* handler) override; + Status Accept(Handler* handler) override; private: Status status_; @@ -109,7 +109,7 @@ class SingleValueTraceExecutionResult : public TraceExecutionResult { uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type); - virtual ~SingleValueTraceExecutionResult() override; + ~SingleValueTraceExecutionResult() override; // Return status of DB::Get(). virtual const Status& GetStatus() const; @@ -117,7 +117,7 @@ class SingleValueTraceExecutionResult : public TraceExecutionResult { // Value for the searched key. virtual const std::string& GetValue() const; - virtual Status Accept(Handler* handler) override; + Status Accept(Handler* handler) override; private: Status status_; @@ -133,7 +133,7 @@ class MultiValuesTraceExecutionResult : public TraceExecutionResult { uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type); - virtual ~MultiValuesTraceExecutionResult() override; + ~MultiValuesTraceExecutionResult() override; // Returned Status(es) of DB::MultiGet(). virtual const std::vector& GetMultiStatus() const; @@ -141,7 +141,7 @@ class MultiValuesTraceExecutionResult : public TraceExecutionResult { // Returned values for the searched keys. virtual const std::vector& GetValues() const; - virtual Status Accept(Handler* handler) override; + Status Accept(Handler* handler) override; private: std::vector multi_status_; @@ -161,7 +161,7 @@ class IteratorTraceExecutionResult : public TraceExecutionResult { uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type); - virtual ~IteratorTraceExecutionResult() override; + ~IteratorTraceExecutionResult() override; // Return if the Iterator is valid. virtual bool GetValid() const; @@ -175,7 +175,7 @@ class IteratorTraceExecutionResult : public TraceExecutionResult { // Value of the current iterating entry, empty if GetValid() is false. virtual Slice GetValue() const; - virtual Status Accept(Handler* handler) override; + Status Accept(Handler* handler) override; private: bool valid_; diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h index 2a126128708..01293f0d0cc 100644 --- a/include/rocksdb/utilities/env_mirror.h +++ b/include/rocksdb/utilities/env_mirror.h @@ -55,8 +55,8 @@ class EnvMirror : public EnvWrapper { const std::string& old_fname, std::unique_ptr* r, const EnvOptions& options) override; - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result) override { + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override { std::unique_ptr br; Status as = a_->NewDirectory(name, result); Status bs = b_->NewDirectory(name, &br); diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 0925eaf0a34..875a132e408 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -102,7 +102,7 @@ class OptimisticTransactionDB : public StackableDB { std::vector* handles, OptimisticTransactionDB** dbptr); - virtual ~OptimisticTransactionDB() {} + ~OptimisticTransactionDB() override {} // Starts a new Transaction. // diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index 782b14e652a..aea24526c03 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -299,9 +299,9 @@ class OptionTypeInfo { template static OptionTypeInfo Enum( int offset, const std::unordered_map* const map, - OptionTypeFlags flags = OptionTypeFlags::kNone) { - OptionTypeInfo info(offset, OptionType::kEnum, - OptionVerificationType::kNormal, flags); + OptionTypeFlags flags = OptionTypeFlags::kNone, + OptionVerificationType verification = OptionVerificationType::kNormal) { + OptionTypeInfo info(offset, OptionType::kEnum, verification, flags); info.SetParseFunc( // Uses the map argument to convert the input string into // its corresponding enum value. If value is found in the map, diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index 6c52453e7e9..f3508906a65 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -34,13 +34,12 @@ class SimCache; // BlockBasedTableOptions.block_size = 4096 by default but is configurable, // Therefore, generally the actual memory overhead of SimCache is Less than // sim_capacity * 2% -extern std::shared_ptr NewSimCache(std::shared_ptr cache, - size_t sim_capacity, - int num_shard_bits); +std::shared_ptr NewSimCache(std::shared_ptr cache, + size_t sim_capacity, int num_shard_bits); -extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, - std::shared_ptr cache, - int num_shard_bits); +std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits); // An abstract base class (public interface) to the SimCache implementation class SimCache : public CacheWrapper { diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index ad7f38180f3..bf59509577b 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -27,7 +27,7 @@ class StackableDB : public DB { explicit StackableDB(std::shared_ptr db) : db_(db.get()), shared_db_ptr_(db) {} - ~StackableDB() { + ~StackableDB() override { if (shared_db_ptr_ == nullptr) { delete db_; } else { @@ -36,49 +36,47 @@ class StackableDB : public DB { db_ = nullptr; } - virtual Status Close() override { return db_->Close(); } + Status Close() override { return db_->Close(); } virtual DB* GetBaseDB() { return db_; } - virtual DB* GetRootDB() override { return db_->GetRootDB(); } + DB* GetRootDB() override { return db_->GetRootDB(); } - virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, - const std::string& column_family_name, - ColumnFamilyHandle** handle) override { + Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override { return db_->CreateColumnFamily(options, column_family_name, handle); } - virtual Status CreateColumnFamilies( + Status CreateColumnFamilies( const ColumnFamilyOptions& options, const std::vector& column_family_names, std::vector* handles) override { return db_->CreateColumnFamilies(options, column_family_names, handles); } - virtual Status CreateColumnFamilies( + Status CreateColumnFamilies( const std::vector& column_families, std::vector* handles) override { return db_->CreateColumnFamilies(column_families, handles); } - virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override { + Status DropColumnFamily(ColumnFamilyHandle* column_family) override { return db_->DropColumnFamily(column_family); } - virtual Status DropColumnFamilies( + Status DropColumnFamilies( const std::vector& column_families) override { return db_->DropColumnFamilies(column_families); } - virtual Status DestroyColumnFamilyHandle( - ColumnFamilyHandle* column_family) override { + Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) override { return db_->DestroyColumnFamilyHandle(column_family); } using DB::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& val) override { + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) override { return db_->Put(options, column_family, key, val); } Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, @@ -98,10 +96,10 @@ class StackableDB : public DB { } using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override { - return db_->Get(options, column_family, key, value); + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + std::string* timestamp) override { + return db_->Get(options, column_family, key, value, timestamp); } using DB::GetEntity; @@ -112,32 +110,23 @@ class StackableDB : public DB { } using DB::GetMergeOperands; - virtual Status GetMergeOperands( - const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* slice, - GetMergeOperandsOptions* get_merge_operands_options, - int* number_of_operands) override { + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { return db_->GetMergeOperands(options, column_family, key, slice, get_merge_operands_options, number_of_operands); } using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, - std::vector* values) override { - return db_->MultiGet(options, column_family, keys, values); - } - - virtual void MultiGet(const ReadOptions& options, - ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input = false) override { - return db_->MultiGet(options, column_family, num_keys, keys, values, - statuses, sorted_input); + void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input = false) override { + return db_->MultiGet(options, num_keys, column_families, keys, values, + timestamps, statuses, sorted_input); } using DB::MultiGetEntity; @@ -159,21 +148,20 @@ class StackableDB : public DB { } using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& options) override { + Status IngestExternalFile(ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& options) override { return db_->IngestExternalFile(column_family, external_files, options); } using DB::IngestExternalFiles; - virtual Status IngestExternalFiles( + Status IngestExternalFiles( const std::vector& args) override { return db_->IngestExternalFiles(args); } using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, @@ -182,7 +170,7 @@ class StackableDB : public DB { import_options, metadata, handle); } - virtual Status CreateColumnFamilyWithImport( + Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const std::vector& metadatas, @@ -192,9 +180,9 @@ class StackableDB : public DB { } using DB::ClipColumnFamily; - virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, - const Slice& begin_key, - const Slice& end_key) override { + Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override { return db_->ClipColumnFamily(column_family, begin_key, end_key); } @@ -203,24 +191,22 @@ class StackableDB : public DB { return db_->VerifyFileChecksums(read_opts); } - virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + Status VerifyChecksum() override { return db_->VerifyChecksum(); } - virtual Status VerifyChecksum(const ReadOptions& options) override { + Status VerifyChecksum(const ReadOptions& options) override { return db_->VerifyChecksum(options); } using DB::KeyMayExist; - virtual bool KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, - bool* value_found = nullptr) override { + bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) override { return db_->KeyMayExist(options, column_family, key, value, value_found); } using DB::Delete; - virtual Status Delete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override { + Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family, + const Slice& key) override { return db_->Delete(wopts, column_family, key); } Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family, @@ -229,9 +215,9 @@ class StackableDB : public DB { } using DB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override { + Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { return db_->SingleDelete(wopts, column_family, key); } Status SingleDelete(const WriteOptions& wopts, @@ -248,9 +234,8 @@ class StackableDB : public DB { } using DB::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) override { return db_->Merge(options, column_family, key, value); } Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, @@ -258,20 +243,19 @@ class StackableDB : public DB { return db_->Merge(options, column_family, key, ts, value); } - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { + Status Write(const WriteOptions& opts, WriteBatch* updates) override { return db_->Write(opts, updates); } using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& opts, - ColumnFamilyHandle* column_family) override { + Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { return db_->NewIterator(opts, column_family); } - virtual Status NewIterators( - const ReadOptions& options, - const std::vector& column_families, - std::vector* iterators) override { + Status NewIterators(const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override { return db_->NewIterators(options, column_families, iterators); } @@ -281,8 +265,6 @@ class StackableDB : public DB { return db_->GetIteratorSequenceNumber(iterator); } - virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } - // RocksDB-Cloud contribution begin Status GetSuperSnapshots( const std::vector& column_families, @@ -291,59 +273,65 @@ class StackableDB : public DB { } // RocksDB-Cloud contribution end - virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + using DB::NewMultiCfIterator; + std::unique_ptr NewMultiCfIterator( + const ReadOptions& options, + const std::vector& column_families) override { + return db_->NewMultiCfIterator(options, column_families); + } + + const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + void ReleaseSnapshot(const Snapshot* snapshot) override { return db_->ReleaseSnapshot(snapshot); } using DB::GetMapProperty; using DB::GetProperty; - virtual bool GetProperty(ColumnFamilyHandle* column_family, - const Slice& property, std::string* value) override { + bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, + std::string* value) override { return db_->GetProperty(column_family, property, value); } - virtual bool GetMapProperty( - ColumnFamilyHandle* column_family, const Slice& property, - std::map* value) override { + bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, + std::map* value) override { return db_->GetMapProperty(column_family, property, value); } using DB::GetIntProperty; - virtual bool GetIntProperty(ColumnFamilyHandle* column_family, - const Slice& property, uint64_t* value) override { + bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, + uint64_t* value) override { return db_->GetIntProperty(column_family, property, value); } using DB::GetAggregatedIntProperty; - virtual bool GetAggregatedIntProperty(const Slice& property, - uint64_t* value) override { + bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { return db_->GetAggregatedIntProperty(property, value); } using DB::GetApproximateSizes; - virtual Status GetApproximateSizes(const SizeApproximationOptions& options, - ColumnFamilyHandle* column_family, - const Range* r, int n, - uint64_t* sizes) override { + Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, const Range* r, + int n, uint64_t* sizes) override { return db_->GetApproximateSizes(options, column_family, r, n, sizes); } using DB::GetApproximateMemTableStats; - virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, - const Range& range, - uint64_t* const count, - uint64_t* const size) override { + void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, uint64_t* const count, + uint64_t* const size) override { return db_->GetApproximateMemTableStats(column_family, range, count, size); } using DB::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) override { + Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, const Slice* begin, + const Slice* end) override { return db_->CompactRange(options, column_family, begin, end); } using DB::CompactFiles; - virtual Status CompactFiles( + Status CompactFiles( const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, const int output_level, @@ -355,111 +343,96 @@ class StackableDB : public DB { compaction_job_info); } - virtual Status PauseBackgroundWork() override { - return db_->PauseBackgroundWork(); - } - virtual Status ContinueBackgroundWork() override { + Status PauseBackgroundWork() override { return db_->PauseBackgroundWork(); } + Status ContinueBackgroundWork() override { return db_->ContinueBackgroundWork(); } - virtual Status EnableAutoCompaction( + Status EnableAutoCompaction( const std::vector& column_family_handles) override { return db_->EnableAutoCompaction(column_family_handles); } - virtual void EnableManualCompaction() override { + void EnableManualCompaction() override { return db_->EnableManualCompaction(); } - virtual void DisableManualCompaction() override { + void DisableManualCompaction() override { return db_->DisableManualCompaction(); } - virtual Status WaitForCompact( + Status WaitForCompact( const WaitForCompactOptions& wait_for_compact_options) override { return db_->WaitForCompact(wait_for_compact_options); } using DB::NumberLevels; - virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); } using DB::MaxMemCompactionLevel; - virtual int MaxMemCompactionLevel( - ColumnFamilyHandle* column_family) override { + int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override { return db_->MaxMemCompactionLevel(column_family); } using DB::Level0StopWriteTrigger; - virtual int Level0StopWriteTrigger( - ColumnFamilyHandle* column_family) override { + int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override { return db_->Level0StopWriteTrigger(column_family); } - virtual const std::string& GetName() const override { return db_->GetName(); } + const std::string& GetName() const override { return db_->GetName(); } - virtual Env* GetEnv() const override { return db_->GetEnv(); } + Env* GetEnv() const override { return db_->GetEnv(); } - virtual FileSystem* GetFileSystem() const override { - return db_->GetFileSystem(); - } + FileSystem* GetFileSystem() const override { return db_->GetFileSystem(); } using DB::GetOptions; - virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { + Options GetOptions(ColumnFamilyHandle* column_family) const override { return db_->GetOptions(column_family); } using DB::GetDBOptions; - virtual DBOptions GetDBOptions() const override { - return db_->GetDBOptions(); - } + DBOptions GetDBOptions() const override { return db_->GetDBOptions(); } using DB::Flush; - virtual Status Flush(const FlushOptions& fopts, - ColumnFamilyHandle* column_family) override { + Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { return db_->Flush(fopts, column_family); } - virtual Status Flush( + Status Flush( const FlushOptions& fopts, const std::vector& column_families) override { return db_->Flush(fopts, column_families); } - virtual Status SyncWAL() override { return db_->SyncWAL(); } + Status SyncWAL() override { return db_->SyncWAL(); } - virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } - virtual Status LockWAL() override { return db_->LockWAL(); } + Status LockWAL() override { return db_->LockWAL(); } - virtual Status UnlockWAL() override { return db_->UnlockWAL(); } + Status UnlockWAL() override { return db_->UnlockWAL(); } + Status DisableFileDeletions() override { return db_->DisableFileDeletions(); } - virtual Status DisableFileDeletions() override { - return db_->DisableFileDeletions(); - } + Status EnableFileDeletions() override { return db_->EnableFileDeletions(); } - virtual Status EnableFileDeletions(bool force) override { - return db_->EnableFileDeletions(force); - } - - virtual void GetLiveFilesMetaData( - std::vector* metadata) override { + void GetLiveFilesMetaData(std::vector* metadata) override { db_->GetLiveFilesMetaData(metadata); } - virtual Status GetLiveFilesChecksumInfo( - FileChecksumList* checksum_list) override { + Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) override { return db_->GetLiveFilesChecksumInfo(checksum_list); } - virtual Status GetLiveFilesStorageInfo( + Status GetLiveFilesStorageInfo( const LiveFilesStorageInfoOptions& opts, std::vector* files) override { return db_->GetLiveFilesStorageInfo(opts, files); } - virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* cf_meta) override { + void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) override { db_->GetColumnFamilyMetaData(column_family, cf_meta); } @@ -504,13 +477,12 @@ class StackableDB : public DB { return db_->NewDefaultReplayer(handles, std::move(reader), replayer); } - - virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, - bool flush_memtable = true) override { + Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { return db_->GetLiveFiles(vec, mfs, flush_memtable); } - virtual SequenceNumber GetLatestSequenceNumber() const override { + SequenceNumber GetLatestSequenceNumber() const override { return db_->GetLatestSequenceNumber(); } @@ -524,16 +496,16 @@ class StackableDB : public DB { return db_->GetFullHistoryTsLow(column_family, ts_low); } - virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + Status GetSortedWalFiles(VectorLogPtr& files) override { return db_->GetSortedWalFiles(files); } - virtual Status GetCurrentWalFile( + Status GetCurrentWalFile( std::unique_ptr* current_log_file) override { return db_->GetCurrentWalFile(current_log_file); } - virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { + Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { return db_->GetCreationTimeOfOldestFile(creation_time); } @@ -544,15 +516,13 @@ class StackableDB : public DB { // do not plan to maintain it, the contract will likely remain underspecified // until its removal. Any user is encouraged to read the implementation // carefully and migrate away from it when possible. - virtual Status DeleteFile(std::string name) override { - return db_->DeleteFile(name); - } + Status DeleteFile(std::string name) override { return db_->DeleteFile(name); } - virtual Status GetDbIdentity(std::string& identity) const override { + Status GetDbIdentity(std::string& identity) const override { return db_->GetDbIdentity(identity); } - virtual Status GetDbSessionId(std::string& session_id) const override { + Status GetDbSessionId(std::string& session_id) const override { return db_->GetDbSessionId(session_id); } @@ -579,53 +549,50 @@ class StackableDB : public DB { } using DB::SetOptions; - virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, - const std::unordered_map& - new_options) override { + Status SetOptions(ColumnFamilyHandle* column_family_handle, + const std::unordered_map& + new_options) override { return db_->SetOptions(column_family_handle, new_options); } - virtual Status SetDBOptions( - const std::unordered_map& new_options) - override { + Status SetDBOptions(const std::unordered_map& + new_options) override { return db_->SetDBOptions(new_options); } using DB::ResetStats; - virtual Status ResetStats() override { return db_->ResetStats(); } + Status ResetStats() override { return db_->ResetStats(); } using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override { + Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override { return db_->GetPropertiesOfAllTables(column_family, props); } using DB::GetPropertiesOfTablesInRange; - virtual Status GetPropertiesOfTablesInRange( + Status GetPropertiesOfTablesInRange( ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override { return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); } - virtual Status GetUpdatesSince( + Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) override { return db_->GetUpdatesSince(seq_number, iter, read_options); } - virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, - const Slice* begin, - const Slice* end) override { + Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override { return db_->SuggestCompactRange(column_family, begin, end); } - virtual Status PromoteL0(ColumnFamilyHandle* column_family, - int target_level) override { + Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override { return db_->PromoteL0(column_family, target_level); } - virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + ColumnFamilyHandle* DefaultColumnFamily() const override { return db_->DefaultColumnFamily(); } @@ -642,7 +609,7 @@ class StackableDB : public DB { return db_->TryCatchUpWithPrimary(); } - virtual Status Resume() override { return db_->Resume(); } + Status Resume() override { return db_->Resume(); } protected: DB* db_; diff --git a/include/rocksdb/utilities/table_properties_collectors.h b/include/rocksdb/utilities/table_properties_collectors.h index f9d8d5dcdd7..064ce32f4a4 100644 --- a/include/rocksdb/utilities/table_properties_collectors.h +++ b/include/rocksdb/utilities/table_properties_collectors.h @@ -80,9 +80,8 @@ class CompactOnDeletionCollectorFactory // the specified number for "D" will not be changed. // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction // based on deletion ratio. Disabled by default. -extern std::shared_ptr +std::shared_ptr NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio = 0); } // namespace ROCKSDB_NAMESPACE - diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index e6452056a00..7625a3e38a4 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -417,6 +417,23 @@ class Transaction { std::string* value, bool exclusive = true, const bool do_validate = true) = 0; + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive = true, + const bool do_validate = true) { + if (pinnable_val == nullptr) { + std::string* null_str = nullptr; + return GetForUpdate(options, key, null_str, exclusive, do_validate); + } else { + auto s = GetForUpdate(options, key, pinnable_val->GetSelf(), exclusive, + do_validate); + pinnable_val->PinSelf(); + return s; + } + } + virtual std::vector MultiGetForUpdate( const ReadOptions& options, const std::vector& column_family, diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 3c4b63068e6..3abd370b612 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -135,7 +135,7 @@ class RangeLockManagerHandle : public LockManagerHandle { virtual std::vector GetRangeDeadlockInfoBuffer() = 0; virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; - virtual ~RangeLockManagerHandle() {} + ~RangeLockManagerHandle() override {} }; // A factory function to create a Range Lock Manager. The created object should @@ -390,8 +390,8 @@ class TransactionDB : public StackableDB { // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must // additionally be set. using StackableDB::DeleteRange; - virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, - const Slice&, const Slice&) override { + Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, const Slice&, + const Slice&) override { return Status::NotSupported(); } // Open a TransactionDB similar to DB::Open(). @@ -503,4 +503,3 @@ class TransactionDB : public StackableDB { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index e0536712c4a..3fa168d0f9f 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -10,7 +10,6 @@ // inserted. #pragma once - #include #include #include @@ -30,6 +29,7 @@ class DB; class ReadCallback; struct ReadOptions; struct DBOptions; +class MergeContext; enum WriteType { kPutRecord, @@ -39,11 +39,12 @@ enum WriteType { kDeleteRangeRecord, kLogDataRecord, kXIDRecord, + kPutEntityRecord, kUnknownRecord, }; -// an entry for Put, Merge, Delete, or SingleDelete entry for write batches. -// Used in WBWIIterator. +// An entry for Put, PutEntity, Merge, Delete, or SingleDelete for write +// batches. Used in WBWIIterator. struct WriteEntry { WriteType type = kUnknownRecord; Slice key; @@ -77,12 +78,11 @@ class WBWIIterator { }; // A WriteBatchWithIndex with a binary searchable index built for all the keys -// inserted. -// In Put(), Merge() Delete(), or SingleDelete(), the same function of the -// wrapped will be called. At the same time, indexes will be built. -// By calling GetWriteBatch(), a user will get the WriteBatch for the data -// they inserted, which can be used for DB::Write(). -// A user can call NewIterator() to create an iterator. +// inserted. In Put(), PutEntity(), Merge(), Delete(), or SingleDelete(), the +// corresponding function of the wrapped WriteBatch will be called. At the same +// time, indexes will be built. By calling GetWriteBatch(), a user will get the +// WriteBatch for the data they inserted, which can be used for DB::Write(). A +// user can call NewIterator() to create an iterator. class WriteBatchWithIndex : public WriteBatchBase { public: // backup_index_comparator: the backup comparator used to compare keys @@ -121,14 +121,7 @@ class WriteBatchWithIndex : public WriteBatchBase { } Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */, - const WideColumns& /* columns */) override { - if (!column_family) { - return Status::InvalidArgument( - "Cannot call this method without a column family handle"); - } - return Status::NotSupported( - "PutEntity not supported by WriteBatchWithIndex"); - } + const WideColumns& /* columns */) override; Status PutEntity(const Slice& /* key */, const AttributeGroups& attribute_groups) override { @@ -236,6 +229,19 @@ class WriteBatchWithIndex : public WriteBatchBase { return GetFromBatch(nullptr, options, key, value); } + // If the batch contains an entry for "key" in "column_family", return it as a + // wide-column entity in "*columns". If the entry is a wide-column entity, + // return it as-is; if it is a plain key-value, return it as an entity with a + // single anonymous column (see kDefaultWideColumnName) which contains the + // value. + // + // Returns OK on success, NotFound if the there is no mapping for "key", + // MergeInProgress if the key has merge operands but the base value cannot be + // resolved based on the batch, or some error status (e.g. Corruption + // or InvalidArgument) on failure. + Status GetEntityFromBatch(ColumnFamilyHandle* column_family, const Slice& key, + PinnableWideColumns* columns); + // Similar to DB::Get() but will also read writes from this batch. // // This function will query both this batch and the DB and then merge @@ -262,21 +268,24 @@ class WriteBatchWithIndex : public WriteBatchBase { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value); + // TODO: implement GetEntityFromBatchAndDB + void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, bool sorted_input); + // TODO: implement MultiGetEntityFromBatchAndDB + // Records the state of the batch for future calls to RollbackToSavePoint(). // May be called multiple times to set multiple save points. void SetSavePoint() override; - // Remove all entries in this batch (Put, Merge, Delete, SingleDelete, - // PutLogData) since the most recent call to SetSavePoint() and removes the - // most recent save point. - // If there is no previous call to SetSavePoint(), behaves the same as - // Clear(). + // Remove all entries in this batch (Put, PutEntity, Merge, Delete, + // SingleDelete, PutLogData) since the most recent call to SetSavePoint() and + // removes the most recent save point. If there is no previous call to + // SetSavePoint(), behaves the same as Clear(). // // Calling RollbackToSavePoint invalidates any open iterators on this batch. // @@ -305,6 +314,11 @@ class WriteBatchWithIndex : public WriteBatchBase { // last sub-batch. size_t SubBatchCnt(); + void MergeAcrossBatchAndDB(ColumnFamilyHandle* column_family, + const Slice& key, + const PinnableWideColumns& existing, + const MergeContext& merge_context, + PinnableSlice* value, Status* status); Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, ReadCallback* callback); diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index b1ab4f46084..32bae6700ad 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -11,8 +11,8 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. -#define ROCKSDB_MAJOR 8 -#define ROCKSDB_MINOR 9 +#define ROCKSDB_MAJOR 9 +#define ROCKSDB_MINOR 1 #define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with diff --git a/include/rocksdb/wal_filter.h b/include/rocksdb/wal_filter.h index 3e66c39e464..75e030d2fcf 100644 --- a/include/rocksdb/wal_filter.h +++ b/include/rocksdb/wal_filter.h @@ -105,7 +105,7 @@ class WalFilter : public Customizable { // Returns a name that identifies this WAL filter. // The name will be printed to LOG file on start up for diagnosis. - virtual const char* Name() const override = 0; + const char* Name() const override = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/wide_columns.h b/include/rocksdb/wide_columns.h index 35b81268bed..79763755102 100644 --- a/include/rocksdb/wide_columns.h +++ b/include/rocksdb/wide_columns.h @@ -238,9 +238,17 @@ class AttributeGroup { WideColumns columns_; }; +inline bool operator==(const AttributeGroup& lhs, const AttributeGroup& rhs) { + return lhs.column_family() == rhs.column_family() && + lhs.columns() == rhs.columns(); +} + // A collection of Attribute Groups. using AttributeGroups = std::vector; +// An empty set of Attribute Groups. +extern const AttributeGroups kNoAttributeGroups; + // Used in Read Path. Wide-columns returned from the query are pinnable. class PinnableAttributeGroup { public: diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index ee4402695e1..75f17a575e0 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -101,15 +101,12 @@ class WriteBatch : public WriteBatchBase { } using WriteBatchBase::TimedPut; - // DO NOT USE, UNDER CONSTRUCTION + // EXPERIMENTAL // Stores the mapping "key->value" in the database with the specified write - // time in the column family. - Status TimedPut(ColumnFamilyHandle* /* column_family */, - const Slice& /* key */, const Slice& /* value */, - uint64_t /* write_unix_time */) override { - // TODO(yuzhangyu): implement take in the write time. - return Status::NotSupported("TimedPut is under construction"); - } + // time in the column family. Also see documentation in + // `WriteBatchBase::TimedPut` for the API's usage and limitations. + Status TimedPut(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, uint64_t write_unix_time) override; // Store the mapping "key->{column1:value1, column2:value2, ...}" in the // column family specified by "column_family". @@ -259,6 +256,13 @@ class WriteBatch : public WriteBatchBase { // If user-defined timestamp is enabled, then `key` includes timestamp. virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {} + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status TimedPutCF(uint32_t /*column_family_id*/, + const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*write_time*/) { + return Status::InvalidArgument("TimedPutCF not implemented"); + } + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */, @@ -385,6 +389,9 @@ class WriteBatch : public WriteBatchBase { // Returns true if PutCF will be called during Iterate bool HasPut() const; + // Returns true if TimedPutCF will be called during Iterate + bool HasTimedPut() const; + // Returns true if PutEntityCF will be called during Iterate bool HasPutEntity() const; diff --git a/include/rocksdb/write_batch_base.h b/include/rocksdb/write_batch_base.h index 5b26ee543b5..2bf6a3c424a 100644 --- a/include/rocksdb/write_batch_base.h +++ b/include/rocksdb/write_batch_base.h @@ -42,6 +42,7 @@ class WriteBatchBase { const SliceParts& value); virtual Status Put(const SliceParts& key, const SliceParts& value); + // EXPERIMENTAL // Store the mapping "key->value" in the database with the specified write // time in the column family. Using some write time that is in the past to // fast track data to their correct placement and preservation is the intended @@ -49,10 +50,9 @@ class WriteBatchBase { // as having the given write time for this purpose but doesn't currently make // any guarantees. // - // When a regular Put("foo", "v1") is followed by a - // TimedPut("foo", "v2", some_time_before_first_put), the behavior of read - // queries are undefined and can change over time, for example due to - // compactions. + // This feature is experimental and one known side effect is that it can break + // snapshot immutability. Reading from a snapshot created before + // TimedPut(k, v, t) may or may not see that k->v. // Note: this feature is currently not compatible with user-defined timestamps // and wide columns. virtual Status TimedPut(ColumnFamilyHandle* column_family, const Slice& key, diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 0fc503e69ed..053e4a33335 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -46,6 +46,7 @@ set(JNI_NATIVE_SOURCES rocksjni/hyper_clock_cache.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc + rocksjni/jni_multiget_helpers.cc rocksjni/jnicallback.cc rocksjni/loggerjnicallback.cc rocksjni/lru_cache.cc @@ -74,9 +75,11 @@ set(JNI_NATIVE_SOURCES rocksjni/sst_partitioner.cc rocksjni/statistics.cc rocksjni/statisticsjni.cc + rocksjni/stderr_logger.cc rocksjni/table.cc rocksjni/table_filter.cc rocksjni/table_filter_jnicallback.cc + rocksjni/table_properties_collector_factory.cc rocksjni/testable_event_listener.cc rocksjni/thread_status.cc rocksjni/trace_writer.cc @@ -113,7 +116,6 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractTransactionNotifier.java src/main/java/org/rocksdb/AbstractWalFilter.java src/main/java/org/rocksdb/AbstractWriteBatch.java - src/main/java/org/rocksdb/AccessHint.java src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java src/main/java/org/rocksdb/BackgroundErrorReason.java @@ -168,6 +170,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/FlushJobInfo.java src/main/java/org/rocksdb/FlushReason.java src/main/java/org/rocksdb/FlushOptions.java + src/main/java/org/rocksdb/GetStatus.java src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java src/main/java/org/rocksdb/HashSkipListMemTableConfig.java src/main/java/org/rocksdb/HistogramData.java @@ -186,6 +189,8 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/LiveFileMetaData.java src/main/java/org/rocksdb/LogFile.java src/main/java/org/rocksdb/Logger.java + src/main/java/org/rocksdb/LoggerInterface.java + src/main/java/org/rocksdb/LoggerType.java src/main/java/org/rocksdb/LRUCache.java src/main/java/org/rocksdb/MemoryUsageType.java src/main/java/org/rocksdb/MemoryUtil.java @@ -256,6 +261,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/TableFileDeletionInfo.java src/main/java/org/rocksdb/TableFilter.java src/main/java/org/rocksdb/TableProperties.java + src/main/java/org/rocksdb/TablePropertiesCollectorFactory.java src/main/java/org/rocksdb/TableFormatConfig.java src/main/java/org/rocksdb/ThreadType.java src/main/java/org/rocksdb/ThreadStatus.java @@ -285,6 +291,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/WriteBufferManager.java src/main/java/org/rocksdb/WriteStallCondition.java src/main/java/org/rocksdb/WriteStallInfo.java + src/main/java/org/rocksdb/util/BufferUtil.java src/main/java/org/rocksdb/util/ByteUtil.java src/main/java/org/rocksdb/util/BytewiseComparator.java src/main/java/org/rocksdb/util/Environment.java @@ -298,6 +305,7 @@ set(JAVA_MAIN_CLASSES src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/RocksNativeLibraryResource.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java + src/main/java/org/rocksdb/util/StdErrLogger.java src/test/java/org/rocksdb/util/WriteBatchGetter.java ) @@ -409,6 +417,7 @@ set(JAVA_TEST_CLASSES src/test/java/org/rocksdb/MemoryUtilTest.java src/test/java/org/rocksdb/TableFilterTest.java src/test/java/org/rocksdb/TtlDBTest.java + src/test/java/org/rocksdb/util/StdErrLoggerTest.java ) set(JAVA_TEST_RUNNING_CLASSES @@ -507,6 +516,7 @@ set(JAVA_TEST_RUNNING_CLASSES org.rocksdb.MemoryUtilTest org.rocksdb.TableFilterTest org.rocksdb.TtlDBTest + org.rocksdb.util.StdErrLoggerTest ) include(FindJava) @@ -785,13 +795,84 @@ if(NOT MINGW) ) endif() +# Javadoc Jar +set(ROCKSDB_JAVADOC_JAR rocksdbjni-${CMAKE_PROJECT_VERSION}-javadoc.jar) +create_javadoc(rocksdb + PACKAGES org.rocksdb org.rocksdb.util + SOURCEPATH "${PROJECT_SOURCE_DIR}/java/src/main/java" + WINDOWTITLE "RocksDB Java API JavaDoc" + AUTHOR FALSE + USE FALSE + VERSION TRUE +) +add_custom_target(rocksdb_javadocs_jar ALL + COMMAND ${Java_JAR_EXECUTABLE} cvf ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_JAVADOC_JAR} -C ${CMAKE_CURRENT_BINARY_DIR}/javadoc/rocksdb . + BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_JAVADOC_JAR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS rocksdb_javadoc +) + +# Sources Jar +set(ROCKSDB_SOURCES_JAR rocksdbjni-${CMAKE_PROJECT_VERSION}-sources.jar) +add_custom_target(rocksdb_sources_jar ALL + ${Java_JAR_EXECUTABLE} cvf ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_SOURCES_JAR} -C ${PROJECT_SOURCE_DIR}/java/src/main/java/ . + BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_SOURCES_JAR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} +) + +set(bitness 32) +if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(bitness 64) +endif() + +if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + set_target_properties( + ${ROCKSDBJNI_STATIC_LIB} + PROPERTIES + OUTPUT_NAME librocksdbjni-win${bitness} + ) + set(ROCKSDB_JAR rocksdbjni-${CMAKE_PROJECT_VERSION}-win${bitness}.jar) +elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + set_target_properties( + ${ROCKSDBJNI_STATIC_LIB} + PROPERTIES + OUTPUT_NAME "rocksdbjni-linux${bitness}" + ) + set(ROCKSDB_JAR rocksdbjni-${CMAKE_PROJECT_VERSION}-linux${bitness}.jar) +elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set_target_properties( + ${ROCKSDBJNI_STATIC_LIB} + PROPERTIES + OUTPUT_NAME rocksdbjni-osx-${CMAKE_SYSTEM_PROCESSOR} + ) + set_target_properties( + ${ROCKSDBJNI_STATIC_LIB} + PROPERTIES + SUFFIX ".jnilib" + ) + set(ROCKSDB_JAR rocksdbjni-${CMAKE_PROJECT_VERSION}-osx.jar) +else() + set(ROCKSDB_JAR rocksdb-${CMAKE_PROJECT_VERSION}.jar) +endif() + +get_target_property(ROCKS_JAR_FILE rocksdbjni_classes JAR_FILE) + +add_custom_target(rocksdbjava ALL + COMMAND ${CMAKE_COMMAND} -E copy ${ROCKS_JAR_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_JAR} + COMMAND ${Java_JAR_EXECUTABLE} -uf ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_JAR} -C $ $ + COMMAND ${Java_JAR_EXECUTABLE} -uf ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_JAR} -C ${CMAKE_CURRENT_SOURCE_DIR} HISTORY-JAVA.md + DEPENDS ${ROCKSDBJNI_STATIC_LIB} rocksdbjni_classes + BYPRODUCTS ${ROCKSDB_JAR} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + enable_testing() get_target_property(ROCKSDBJNI_CLASSES_TEST_JAR_FILE rocksdbjni_test_classes JAR_FILE) foreach (CLAZZ ${JAVA_TEST_RUNNING_CLASSES}) if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") add_test( NAME jtest_${CLAZZ} - COMMAND ${Java_JAVA_EXECUTABLE} ${JVMARGS} -ea -Xcheck:jni -Djava.library.path=${PROJECT_BINARY_DIR}/java/${CMAKE_BUILD_TYPE} -classpath ${JAVA_RUN_TESTCLASSPATH}$${ROCKSDBJNI_CLASSES_TEST_JAR_FILE} org.rocksdb.test.RocksJunitRunner ${CLAZZ} + COMMAND ${Java_JAVA_EXECUTABLE} ${JVMARGS} -ea -Xcheck:jni -Djava.library.path=${PROJECT_BINARY_DIR}/java/${CMAKE_BUILD_TYPE} -classpath ${CMAKE_CURRENT_BINARY_DIR}/${ROCKSDB_JAR}$${JAVA_RUN_TESTCLASSPATH}$${ROCKSDBJNI_CLASSES_TEST_JAR_FILE} org.rocksdb.test.RocksJunitRunner ${CLAZZ} ) else() add_test( @@ -799,4 +880,4 @@ foreach (CLAZZ ${JAVA_TEST_RUNNING_CLASSES}) COMMAND ${Java_JAVA_EXECUTABLE} ${JVMARGS} -ea -Xcheck:jni -Djava.library.path=${PROJECT_BINARY_DIR}/java -classpath ${JAVA_RUN_TESTCLASSPATH}:${ROCKSDBJNI_CLASSES_TEST_JAR_FILE} org.rocksdb.test.RocksJunitRunner ${CLAZZ} ) endif() -endforeach(CLAZZ) \ No newline at end of file +endforeach(CLAZZ) diff --git a/java/GetBenchmarks.md b/java/GetPutBenchmarks.md similarity index 66% rename from java/GetBenchmarks.md rename to java/GetPutBenchmarks.md index b66a897e212..600b6377c10 100644 --- a/java/GetBenchmarks.md +++ b/java/GetPutBenchmarks.md @@ -8,16 +8,16 @@ Mac ``` make clean jclean DEBUG_LEVEL=0 make -j12 rocksdbjava -(cd java/target; cp rocksdbjni-7.9.0-osx.jar rocksdbjni-7.9.0-SNAPSHOT-osx.jar) -mvn install:install-file -Dfile=./java/target/rocksdbjni-7.9.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.9.0-SNAPSHOT -Dpackaging=jar +(cd java/target; cp rocksdbjni-7.10.0-osx.jar rocksdbjni-7.10.0-SNAPSHOT-osx.jar) +mvn install:install-file -Dfile=./java/target/rocksdbjni-7.10.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.10.0-SNAPSHOT -Dpackaging=jar ``` Linux ``` make clean jclean DEBUG_LEVEL=0 make -j12 rocksdbjava -(cd java/target; cp rocksdbjni-7.9.0-linux64.jar rocksdbjni-7.9.0-SNAPSHOT-linux64.jar) -mvn install:install-file -Dfile=./java/target/rocksdbjni-7.9.0-SNAPSHOT-linux64.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.9.0-SNAPSHOT -Dpackaging=jar +(cd java/target; cp rocksdbjni-7.10.0-linux64.jar rocksdbjni-7.10.0-SNAPSHOT-linux64.jar) +mvn install:install-file -Dfile=./java/target/rocksdbjni-7.10.0-SNAPSHOT-linux64.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.10.0-SNAPSHOT -Dpackaging=jar ``` Build jmh test package, on either platform @@ -35,31 +35,10 @@ The long performance run (as big as we can make it on our Ubuntu box without fil java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,50000 -p keySize=128 -p valueSize=1024,16384 -p columnFamilyTestType="1_column_family","20_column_families" GetBenchmarks.get GetBenchmarks.preallocatedByteBufferGet GetBenchmarks.preallocatedGet ``` -## Results (small runs, Mac) - -These are run on a 10-core M1 with 64GB of memory and 2TB of SSD. -They probably reflect the absolute best case for this optimization, hitting in-memory buffers and completely eliminating a buffer copy. - -### Before -Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units -GetBenchmarks.get no_column_family 1000 128 N/A 32768 thrpt 25 43496.578 ± 5743.090 ops/s -GetBenchmarks.preallocatedByteBufferGet no_column_family 1000 128 N/A 32768 thrpt 25 70765.578 ± 697.548 ops/s -GetBenchmarks.preallocatedGet no_column_family 1000 128 N/A 32768 thrpt 25 69883.554 ± 944.184 ops/s - -### After fixing byte[] (.get and .preallocatedGet) - -Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units -GetBenchmarks.get no_column_family 1000 128 N/A 32768 thrpt 25 149207.681 ± 2261.671 ops/s -GetBenchmarks.preallocatedByteBufferGet no_column_family 1000 128 N/A 32768 thrpt 25 68920.489 ± 1574.664 ops/s -GetBenchmarks.preallocatedGet no_column_family 1000 128 N/A 32768 thrpt 25 177399.022 ± 2107.375 ops/s +## Results (Ubuntu, big runs) -### After fixing ByteBuffer (.preallocatedByteBufferGet) +NB - we have removed some test results we initially observed on Mac which were not later reproducible. -Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units -GetBenchmarks.get no_column_family 1000 128 N/A 32768 thrpt 25 150389.259 ± 1371.473 ops/s -GetBenchmarks.preallocatedByteBufferGet no_column_family 1000 128 N/A 32768 thrpt 25 179919.468 ± 1670.714 ops/s -GetBenchmarks.preallocatedGet no_column_family 1000 128 N/A 32768 thrpt 25 178261.938 ± 2630.571 ops/s -## Results (Ubuntu, big runs) These take 3-4 hours ``` java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,50000 -p keySize=128 -p valueSize=1024,16384 -p columnFamilyTestType="1_column_family","20_column_families" GetBenchmarks.get GetBenchmarks.preallocatedByteBufferGet GetBenchmarks.preallocatedGet @@ -67,6 +46,13 @@ java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,500 It's clear that all `get()` variants have noticeably improved performance, though not the spectacular gains of the M1. ### With fixes for all of the `get()` instances +The tests which use methods which have had performance improvements applied are: +```java +get() +preallocatedGet() +preallocatedByteBufferGet() +``` + Benchmark (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units GetBenchmarks.get 1_column_family 1000 128 1024 thrpt 25 935648.793 ± 22879.910 ops/s GetBenchmarks.get 1_column_family 1000 128 16384 thrpt 25 204366.301 ± 1326.570 ops/s @@ -159,3 +145,60 @@ GetBenchmarks.preallocatedGet no_column_families 1000 The performance improvement is real. +# Put Performance Benchmarks + +Results associated with [Java API consistency between RocksDB.put() , .merge() and Transaction.put() , .merge()](https://github.com/facebook/rocksdb/pull/11019) + +This work was not designed specifically as a performance optimization, but we want to confirm that it has not regressed what it has changed, and to provide +a baseline for future possible performance work. + +## Build/Run + +Building is as above. Running is a different invocation of the same JMH jar. +``` +java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,50000 -p keySize=128 -p valueSize=1024,32768 -p columnFamilyTestType="no_column_family" PutBenchmarks +``` + +## Before Changes + +These results were generated in a private branch with the `PutBenchmarks` from the PR backported onto the current *main*. + +Benchmark (bufferListSize) (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units +PutBenchmarks.put 16 no_column_family 1000 128 1024 thrpt 25 76670.200 ± 2555.248 ops/s +PutBenchmarks.put 16 no_column_family 1000 128 32768 thrpt 25 3913.692 ± 225.690 ops/s +PutBenchmarks.put 16 no_column_family 50000 128 1024 thrpt 25 74479.589 ± 988.361 ops/s +PutBenchmarks.put 16 no_column_family 50000 128 32768 thrpt 25 4070.800 ± 194.838 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 1000 128 1024 thrpt 25 72150.853 ± 1744.216 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 1000 128 32768 thrpt 25 3896.646 ± 188.629 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 50000 128 1024 thrpt 25 71753.287 ± 1053.904 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 50000 128 32768 thrpt 25 3928.503 ± 264.443 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 1000 128 1024 thrpt 25 72595.105 ± 1027.258 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 1000 128 32768 thrpt 25 3890.100 ± 199.131 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 50000 128 1024 thrpt 25 70878.133 ± 1181.601 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 50000 128 32768 thrpt 25 3863.181 ± 215.888 ops/s + +## After Changes + +These results were generated on the PR branch. + +Benchmark (bufferListSize) (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units +PutBenchmarks.put 16 no_column_family 1000 128 1024 thrpt 25 75178.751 ± 2644.775 ops/s +PutBenchmarks.put 16 no_column_family 1000 128 32768 thrpt 25 3937.175 ± 257.039 ops/s +PutBenchmarks.put 16 no_column_family 50000 128 1024 thrpt 25 74375.519 ± 1776.654 ops/s +PutBenchmarks.put 16 no_column_family 50000 128 32768 thrpt 25 4013.413 ± 257.706 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 1000 128 1024 thrpt 25 71418.303 ± 1610.977 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 1000 128 32768 thrpt 25 4027.581 ± 227.900 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 50000 128 1024 thrpt 25 71229.107 ± 2720.083 ops/s +PutBenchmarks.putByteArrays 16 no_column_family 50000 128 32768 thrpt 25 4022.635 ± 212.540 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 1000 128 1024 thrpt 25 71718.501 ± 787.537 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 1000 128 32768 thrpt 25 4078.050 ± 176.331 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 50000 128 1024 thrpt 25 72736.754 ± 828.971 ops/s +PutBenchmarks.putByteBuffers 16 no_column_family 50000 128 32768 thrpt 25 3987.232 ± 205.577 ops/s + +## Discussion + +The changes don't appear to have had a material effect on performance. We are happy with this. + + * We would obviously advise running future changes before and after to confirm they have no adverse effects. + + diff --git a/java/Makefile b/java/Makefile index e71589e9e18..5e00921c62b 100644 --- a/java/Makefile +++ b/java/Makefile @@ -89,7 +89,8 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.WriteOptions\ org.rocksdb.WriteBatchWithIndex\ org.rocksdb.WriteBufferManager\ - org.rocksdb.WBWIRocksIterator + org.rocksdb.WBWIRocksIterator\ + org.rocksdb.util.StdErrLogger NATIVE_JAVA_TEST_CLASSES = \ org.rocksdb.RocksDBExceptionTest\ @@ -150,7 +151,9 @@ JAVA_TESTS = \ org.rocksdb.LRUCacheTest\ org.rocksdb.MemoryUtilTest\ org.rocksdb.MemTableTest\ + org.rocksdb.MergeCFVariantsTest\ org.rocksdb.MergeTest\ + org.rocksdb.MergeVariantsTest\ org.rocksdb.MultiColumnRegressionTest \ org.rocksdb.MultiGetManyKeysTest\ org.rocksdb.MultiGetTest\ @@ -167,6 +170,8 @@ JAVA_TESTS = \ org.rocksdb.OptionsTest\ org.rocksdb.PerfLevelTest \ org.rocksdb.PerfContextTest \ + org.rocksdb.PutCFVariantsTest\ + org.rocksdb.PutVariantsTest\ org.rocksdb.PlainTableConfigTest\ org.rocksdb.RateLimiterTest\ org.rocksdb.ReadOnlyTest\ @@ -202,7 +207,8 @@ JAVA_TESTS = \ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchThreadedTest\ org.rocksdb.WriteOptionsTest\ - org.rocksdb.WriteBatchWithIndexTest + org.rocksdb.WriteBatchWithIndexTest\ + org.rocksdb.util.StdErrLoggerTest MAIN_SRC = src/main/java TEST_SRC = src/test/java diff --git a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index 070f0fe7581..8673b35fca4 100644 --- a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -614,9 +614,6 @@ private void prepareOptions(Options options) throws RocksDBException { (Integer)flags_.get(Flag.universal_compression_size_percent)); // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly // TODO(yhchiang): enable Flag.merge_operator by switch - options.setAccessHintOnCompactionStart( - (String)flags_.get(Flag.compaction_fadvice)); - // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" for fadvice */ } diff --git a/java/crossbuild/docker-build-linux-alpine.sh b/java/crossbuild/docker-build-linux-alpine.sh deleted file mode 100755 index e3e852efeaf..00000000000 --- a/java/crossbuild/docker-build-linux-alpine.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. - -set -e -#set -x - -# just in-case this is run outside Docker -mkdir -p /rocksdb-local-build - -rm -rf /rocksdb-local-build/* -cp -r /rocksdb-host/* /rocksdb-local-build -cd /rocksdb-local-build - -make clean-not-downloaded -PORTABLE=1 make -j2 rocksdbjavastatic - -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target diff --git a/java/crossbuild/docker-build-linux-centos.sh b/java/crossbuild/docker-build-linux.sh similarity index 66% rename from java/crossbuild/docker-build-linux-centos.sh rename to java/crossbuild/docker-build-linux.sh index 16581dec74e..4996696eac7 100755 --- a/java/crossbuild/docker-build-linux-centos.sh +++ b/java/crossbuild/docker-build-linux.sh @@ -4,6 +4,11 @@ set -e #set -x +# Set job parallelism to 1 (none) if it is not defined in the environment +if [ -z "${J}" ]; then + J=1 +fi + # just in-case this is run outside Docker mkdir -p /rocksdb-local-build @@ -14,24 +19,24 @@ cd /rocksdb-local-build # Use scl devtoolset if available if hash scl 2>/dev/null; then if scl --list | grep -q 'devtoolset-8'; then - # CentOS 6+ - scl enable devtoolset-8 'make clean-not-downloaded' - scl enable devtoolset-8 'PORTABLE=1 make -j2 rocksdbjavastatic' + # CentOS 6+ + scl enable devtoolset-8 'make clean-not-downloaded' + scl enable devtoolset-8 "PORTABLE=1 J=$J make -j$J rocksdbjavastatic" elif scl --list | grep -q 'devtoolset-7'; then # CentOS 6+ scl enable devtoolset-7 'make clean-not-downloaded' - scl enable devtoolset-7 'PORTABLE=1 make -j2 rocksdbjavastatic' + scl enable devtoolset-7 "PORTABLE=1 J=$J make -j$J rocksdbjavastatic" elif scl --list | grep -q 'devtoolset-2'; then # CentOS 5 or 6 scl enable devtoolset-2 'make clean-not-downloaded' - scl enable devtoolset-2 'PORTABLE=1 make -j2 rocksdbjavastatic' + scl enable devtoolset-2 "PORTABLE=1 J=$J make -j$J rocksdbjavastatic" else echo "Could not find devtoolset" exit 1; fi else make clean-not-downloaded - PORTABLE=1 make -j2 rocksdbjavastatic + PORTABLE=1 make -j$J rocksdbjavastatic fi cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target diff --git a/java/jmh/README.md b/java/jmh/README.md index 1575ab51776..d9ba1e60ba5 100644 --- a/java/jmh/README.md +++ b/java/jmh/README.md @@ -6,10 +6,10 @@ These are micro-benchmarks for RocksJava functionality, using [JMH (Java Microbe **Note**: This uses a specific build of RocksDB that is set in the `` element of the `dependencies` section of the `pom.xml` file. If you are testing local changes you should build and install a SNAPSHOT version of rocksdbjni, and update the `pom.xml` of rocksdbjni-jmh file to test with this. -For instance, this is how to install the OSX jar you just built for 6.26.0 +For instance, this is how to install the OSX jar you just built for 8.11.0 ```bash -$ mvn install:install-file -Dfile=./java/target/rocksdbjni-6.26.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=6.26.0-SNAPSHOT -Dpackaging=jar +$ mvn install:install-file -Dfile=./java/target/rocksdbjni-8.11.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=8.11.0-SNAPSHOT -Dpackaging=jar ``` ```bash diff --git a/java/jmh/pom.xml b/java/jmh/pom.xml index 3016aefa788..8400e1e67ee 100644 --- a/java/jmh/pom.xml +++ b/java/jmh/pom.xml @@ -38,8 +38,8 @@ - 1.7 - 1.7 + 17 + 17 UTF-8 1.22 @@ -50,7 +50,7 @@ org.rocksdb rocksdbjni - 7.9.0-SNAPSHOT + 9.0.0 diff --git a/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java b/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java index d374477160e..a8985652ffe 100644 --- a/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java +++ b/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java @@ -55,6 +55,9 @@ public class MultiGetBenchmarks { RocksDB db; private final AtomicInteger keyIndex = new AtomicInteger(); + private List defaultCFHandles = new ArrayList<>(); + private List randomCFHandles = new ArrayList<>(); + @Setup(Level.Trial) public void setup() throws IOException, RocksDBException { RocksDB.loadLibrary(); @@ -88,6 +91,12 @@ public void setup() throws IOException, RocksDBException { cfHandles = cfHandlesList.toArray(new ColumnFamilyHandle[0]); // store initial data for retrieving via get + for (int j = 0; j < keyCount; j++) { + final byte[] paddedValue = Arrays.copyOf(ba("value" + j), valueSize); + db.put(ba("key" + j), paddedValue); + } + + // store initial data for retrieving via get - column families for (int i = 0; i < cfs; i++) { for (int j = 0; j < keyCount; j++) { final byte[] paddedValue = Arrays.copyOf(ba("value" + j), valueSize); @@ -95,6 +104,17 @@ public void setup() throws IOException, RocksDBException { } } + // build a big list of default column families for efficient passing + final ColumnFamilyHandle defaultCFH = db.getDefaultColumnFamily(); + for (int i = 0; i < keyCount; i++) { + defaultCFHandles.add(defaultCFH); + } + + // list of random cfs + for (int i = 0; i < keyCount; i++) { + randomCFHandles.add(cfHandlesList.get((int) (Math.random() * cfs))); + } + try (final FlushOptions flushOptions = new FlushOptions() .setWaitForFlush(true)) { db.flush(flushOptions); @@ -163,15 +183,13 @@ private int next(final int inc, final int limit) { @Setup public void allocateSliceBuffers() { - keysBuffer = ByteBuffer.allocateDirect(keyCount * valueSize); + keysBuffer = ByteBuffer.allocateDirect(keyCount * keySize); valuesBuffer = ByteBuffer.allocateDirect(keyCount * valueSize); valueBuffersList = new ArrayList<>(); keyBuffersList = new ArrayList<>(); for (int i = 0; i < keyCount; i++) { - valueBuffersList.add(valuesBuffer.slice()); - valuesBuffer.position(i * valueSize); - keyBuffersList.add(keysBuffer.slice()); - keysBuffer.position(i * keySize); + valueBuffersList.add(valuesBuffer.slice(i * valueSize, valueSize)); + keyBuffersList.add(keysBuffer.slice(i * keySize, keySize)); } } @@ -181,7 +199,7 @@ public void freeSliceBuffers() { } @Benchmark - public List multiGet10() throws RocksDBException { + public void multiGetList10() throws RocksDBException { final int fromKeyIdx = next(multiGetSize, keyCount); if (fromKeyIdx >= 0) { final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); @@ -191,6 +209,53 @@ public List multiGet10() throws RocksDBException { throw new RuntimeException("Test valueSize assumption wrong"); } } + } + + @Benchmark + public void multiGetListExplicitCF20() throws RocksDBException { + final int fromKeyIdx = next(multiGetSize, keyCount); + if (fromKeyIdx >= 0) { + final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); + final List columnFamilyHandles = + defaultCFHandles.subList(fromKeyIdx, fromKeyIdx + multiGetSize); + final List valueResults = db.multiGetAsList(columnFamilyHandles, keys); + for (final byte[] result : valueResults) { + if (result.length != valueSize) + throw new RuntimeException("Test valueSize assumption wrong"); + } + } + } + + @Benchmark + public void multiGetListRandomCF30() throws RocksDBException { + final int fromKeyIdx = next(multiGetSize, keyCount); + if (fromKeyIdx >= 0) { + final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); + final List columnFamilyHandles = + randomCFHandles.subList(fromKeyIdx, fromKeyIdx + multiGetSize); + final List valueResults = db.multiGetAsList(columnFamilyHandles, keys); + for (final byte[] result : valueResults) { + if (result.length != valueSize) + throw new RuntimeException("Test valueSize assumption wrong"); + } + } + } + + @Benchmark + public List multiGetBB200() throws RocksDBException { + final int fromKeyIdx = next(multiGetSize, keyCount); + if (fromKeyIdx >= 0) { + final List keys = keys(keyBuffersList, fromKeyIdx, fromKeyIdx + multiGetSize); + final List values = + valueBuffersList.subList(fromKeyIdx, fromKeyIdx + multiGetSize); + final List statusResults = db.multiGetByteBuffers(keys, values); + for (final ByteBufferGetStatus result : statusResults) { + if (result.status.getCode() != Status.Code.Ok) + throw new RuntimeException("Test status not OK: " + result.status); + if (result.value.limit() != valueSize) + throw new RuntimeException("Test valueSize assumption wrong"); + } + } return new ArrayList<>(); } diff --git a/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java b/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java index 5aae21cb9ad..cf82401c1f4 100644 --- a/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java +++ b/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java @@ -6,18 +6,19 @@ */ package org.rocksdb.jmh; -import org.openjdk.jmh.annotations.*; -import org.rocksdb.*; -import org.rocksdb.util.FileUtils; +import static org.rocksdb.util.KVUtils.ba; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; - -import static org.rocksdb.util.KVUtils.ba; +import org.openjdk.jmh.annotations.*; +import org.rocksdb.*; +import org.rocksdb.util.FileUtils; @State(Scope.Benchmark) public class PutBenchmarks { @@ -30,12 +31,24 @@ public class PutBenchmarks { }) String columnFamilyTestType; + @Param({"1000", "100000"}) int keyCount; + + @Param({"12", "64", "128"}) int keySize; + + @Param({"64", "1024", "65536"}) int valueSize; + + @Param({"16"}) int bufferListSize; + Path dbDir; DBOptions options; int cfs = 0; // number of column families private AtomicInteger cfHandlesIdx; ColumnFamilyHandle[] cfHandles; RocksDB db; + List keyBuffers = new ArrayList<>(bufferListSize); + List valueBuffers = new ArrayList<>(bufferListSize); + List keyBuffersBB = new ArrayList<>(bufferListSize); + List valueBuffersBB = new ArrayList<>(bufferListSize); @Setup(Level.Trial) public void setup() throws IOException, RocksDBException { @@ -68,6 +81,34 @@ public void setup() throws IOException, RocksDBException { final List cfHandlesList = new ArrayList<>(cfDescriptors.size()); db = RocksDB.open(options, dbDir.toAbsolutePath().toString(), cfDescriptors, cfHandlesList); cfHandles = cfHandlesList.toArray(new ColumnFamilyHandle[0]); + + for (int i = 0; i < bufferListSize; i++) { + final byte[] keyArr = new byte[keySize]; + Arrays.fill(keyArr, (byte) 0x30); + keyBuffers.add(keyArr); + } + + for (int i = 0; i < bufferListSize; i++) { + final byte[] valueArr = new byte[valueSize]; + Arrays.fill(valueArr, (byte) 0x30); + valueBuffers.add(valueArr); + } + + for (int i = 0; i < bufferListSize; i++) { + final ByteBuffer keyBB = ByteBuffer.allocateDirect(keySize); + byte[] keyArr = new byte[keySize]; + Arrays.fill(keyArr, (byte) 0x30); + keyBB.put(keyArr); + keyBuffersBB.add(keyBB); + } + + for (int i = 0; i < bufferListSize; i++) { + final ByteBuffer valueBB = ByteBuffer.allocateDirect(valueSize); + byte[] valueArr = new byte[valueSize]; + Arrays.fill(valueArr, (byte) 0x30); + valueBB.put(valueArr); + valueBuffersBB.add(valueBB); + } } @TearDown(Level.Trial) @@ -104,9 +145,79 @@ public int next() { } } + private T borrow(final List buffers) { + synchronized (buffers) { + while (true) { + if (buffers.isEmpty()) { + try { + Thread.sleep(1000); + } catch (InterruptedException ie) { + return null; + } + continue; + } + return buffers.remove(0); + } + } + } + + private void repay(final List buffers, final T buffer) { + synchronized (buffers) { + buffers.add(buffer); + } + } + @Benchmark - public void put(final ComparatorBenchmarks.Counter counter) throws RocksDBException { + public void put(final Counter counter) throws RocksDBException { + byte[] keyBuf = borrow(keyBuffers); + byte[] valueBuf = borrow(valueBuffers); + + final int i = counter.next(); + final byte[] keyPrefix = ba("key" + i); + final byte[] valuePrefix = ba("value" + i); + System.arraycopy(keyPrefix, 0, keyBuf, 0, keyPrefix.length); + System.arraycopy(valuePrefix, 0, valueBuf, 0, valuePrefix.length); + db.put(getColumnFamily(), keyBuf, valueBuf); + + repay(keyBuffers, keyBuf); + repay(valueBuffers, valueBuf); + } + + @Benchmark + public void putByteArrays(final Counter counter) throws RocksDBException { + byte[] keyBuf = borrow(keyBuffers); + byte[] valueBuf = borrow(valueBuffers); + + final int i = counter.next(); + final byte[] keyPrefix = ba("key" + i); + final byte[] valuePrefix = ba("value" + i); + System.arraycopy(keyPrefix, 0, keyBuf, 0, keyPrefix.length); + System.arraycopy(valuePrefix, 0, valueBuf, 0, valuePrefix.length); + db.put(getColumnFamily(), new WriteOptions(), keyBuf, valueBuf); + + repay(keyBuffers, keyBuf); + repay(valueBuffers, valueBuf); + } + + @Benchmark + public void putByteBuffers(final Counter counter) throws RocksDBException { + ByteBuffer keyBuf = borrow(keyBuffersBB); + keyBuf.clear(); + ByteBuffer valueBuf = borrow(valueBuffersBB); + valueBuf.clear(); + final int i = counter.next(); - db.put(getColumnFamily(), ba("key" + i), ba("value" + i)); + final byte[] keyPrefix = ba("key" + i); + final byte[] valuePrefix = ba("value" + i); + keyBuf.put(keyPrefix, 0, keyPrefix.length); + keyBuf.position(keySize); + keyBuf.flip(); + valueBuf.put(valuePrefix, 0, valuePrefix.length); + valueBuf.position(valueSize); + valueBuf.flip(); + db.put(getColumnFamily(), new WriteOptions(), keyBuf, valueBuf); + + repay(keyBuffersBB, keyBuf); + repay(valueBuffersBB, valueBuf); } } diff --git a/java/pmd-rules.xml b/java/pmd-rules.xml index b710277f148..97ce0362994 100644 --- a/java/pmd-rules.xml +++ b/java/pmd-rules.xml @@ -21,6 +21,7 @@ + diff --git a/java/rocksjni/backup_engine_options.cc b/java/rocksjni/backup_engine_options.cc index 25bfb672062..4304a48617d 100644 --- a/java/rocksjni/backup_engine_options.cc +++ b/java/rocksjni/backup_engine_options.cc @@ -45,7 +45,7 @@ jlong Java_org_rocksdb_BackupEngineOptions_newBackupEngineOptions( * Signature: (J)Ljava/lang/String; */ jstring Java_org_rocksdb_BackupEngineOptions_backupDir(JNIEnv* env, - jobject /*jopt*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -58,7 +58,7 @@ jstring Java_org_rocksdb_BackupEngineOptions_backupDir(JNIEnv* env, * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setBackupEnv( - JNIEnv* /*env*/, jobject /*jopt*/, jlong jhandle, jlong jrocks_env_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jrocks_env_handle) { auto* bopt = reinterpret_cast(jhandle); auto* rocks_env = @@ -72,7 +72,7 @@ void Java_org_rocksdb_BackupEngineOptions_setBackupEnv( * Signature: (JZ)V */ void Java_org_rocksdb_BackupEngineOptions_setShareTableFiles(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jboolean flag) { auto* bopt = @@ -86,7 +86,7 @@ void Java_org_rocksdb_BackupEngineOptions_setShareTableFiles(JNIEnv* /*env*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_BackupEngineOptions_shareTableFiles(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -99,7 +99,7 @@ jboolean Java_org_rocksdb_BackupEngineOptions_shareTableFiles(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setInfoLog(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jlong /*jlogger_handle*/) { auto* bopt = @@ -116,7 +116,7 @@ void Java_org_rocksdb_BackupEngineOptions_setInfoLog(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_BackupEngineOptions_setSync(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jboolean flag) { auto* bopt = @@ -130,7 +130,7 @@ void Java_org_rocksdb_BackupEngineOptions_setSync(JNIEnv* /*env*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_BackupEngineOptions_sync(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -143,7 +143,7 @@ jboolean Java_org_rocksdb_BackupEngineOptions_sync(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_BackupEngineOptions_setDestroyOldData(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jboolean flag) { auto* bopt = @@ -157,7 +157,7 @@ void Java_org_rocksdb_BackupEngineOptions_setDestroyOldData(JNIEnv* /*env*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_BackupEngineOptions_destroyOldData(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -170,7 +170,7 @@ jboolean Java_org_rocksdb_BackupEngineOptions_destroyOldData(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_BackupEngineOptions_setBackupLogFiles(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jboolean flag) { auto* bopt = @@ -184,7 +184,7 @@ void Java_org_rocksdb_BackupEngineOptions_setBackupLogFiles(JNIEnv* /*env*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_BackupEngineOptions_backupLogFiles(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -197,8 +197,7 @@ jboolean Java_org_rocksdb_BackupEngineOptions_backupLogFiles(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setBackupRateLimit( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, - jlong jbackup_rate_limit) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jbackup_rate_limit) { auto* bopt = reinterpret_cast(jhandle); bopt->backup_rate_limit = jbackup_rate_limit; @@ -210,7 +209,7 @@ void Java_org_rocksdb_BackupEngineOptions_setBackupRateLimit( * Signature: (J)J */ jlong Java_org_rocksdb_BackupEngineOptions_backupRateLimit(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -223,7 +222,7 @@ jlong Java_org_rocksdb_BackupEngineOptions_backupRateLimit(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setBackupRateLimiter( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jrate_limiter_handle) { auto* bopt = reinterpret_cast(jhandle); @@ -239,7 +238,7 @@ void Java_org_rocksdb_BackupEngineOptions_setBackupRateLimiter( * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setRestoreRateLimit( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jrestore_rate_limit) { auto* bopt = reinterpret_cast(jhandle); @@ -252,7 +251,7 @@ void Java_org_rocksdb_BackupEngineOptions_setRestoreRateLimit( * Signature: (J)J */ jlong Java_org_rocksdb_BackupEngineOptions_restoreRateLimit(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); @@ -265,7 +264,7 @@ jlong Java_org_rocksdb_BackupEngineOptions_restoreRateLimit(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setRestoreRateLimiter( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jrate_limiter_handle) { auto* bopt = reinterpret_cast(jhandle); @@ -281,7 +280,7 @@ void Java_org_rocksdb_BackupEngineOptions_setRestoreRateLimiter( * Signature: (JZ)V */ void Java_org_rocksdb_BackupEngineOptions_setShareFilesWithChecksum( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean flag) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jboolean flag) { auto* bopt = reinterpret_cast(jhandle); bopt->share_files_with_checksum = flag; @@ -293,7 +292,7 @@ void Java_org_rocksdb_BackupEngineOptions_setShareFilesWithChecksum( * Signature: (J)Z */ jboolean Java_org_rocksdb_BackupEngineOptions_shareFilesWithChecksum( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->share_files_with_checksum; @@ -305,7 +304,7 @@ jboolean Java_org_rocksdb_BackupEngineOptions_shareFilesWithChecksum( * Signature: (JI)V */ void Java_org_rocksdb_BackupEngineOptions_setMaxBackgroundOperations( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jint max_background_operations) { auto* bopt = reinterpret_cast(jhandle); @@ -318,7 +317,7 @@ void Java_org_rocksdb_BackupEngineOptions_setMaxBackgroundOperations( * Signature: (J)I */ jint Java_org_rocksdb_BackupEngineOptions_maxBackgroundOperations( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return static_cast(bopt->max_background_operations); @@ -330,7 +329,7 @@ jint Java_org_rocksdb_BackupEngineOptions_maxBackgroundOperations( * Signature: (JJ)V */ void Java_org_rocksdb_BackupEngineOptions_setCallbackTriggerIntervalSize( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jcallback_trigger_interval_size) { auto* bopt = reinterpret_cast(jhandle); @@ -344,7 +343,7 @@ void Java_org_rocksdb_BackupEngineOptions_setCallbackTriggerIntervalSize( * Signature: (J)J */ jlong Java_org_rocksdb_BackupEngineOptions_callbackTriggerIntervalSize( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return static_cast(bopt->callback_trigger_interval_size); @@ -355,9 +354,9 @@ jlong Java_org_rocksdb_BackupEngineOptions_callbackTriggerIntervalSize( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_BackupEngineOptions_disposeInternal(JNIEnv* /*env*/, - jobject /*jopt*/, - jlong jhandle) { +void Java_org_rocksdb_BackupEngineOptions_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); assert(bopt != nullptr); diff --git a/java/rocksjni/backupenginejni.cc b/java/rocksjni/backupenginejni.cc index 1ba7ea28647..8d25b1a6ee3 100644 --- a/java/rocksjni/backupenginejni.cc +++ b/java/rocksjni/backupenginejni.cc @@ -45,7 +45,7 @@ jlong Java_org_rocksdb_BackupEngine_open(JNIEnv* env, jclass /*jcls*/, * Signature: (JJZ)V */ void Java_org_rocksdb_BackupEngine_createNewBackup( - JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jlong db_handle, + JNIEnv* env, jclass /*jbe*/, jlong jbe_handle, jlong db_handle, jboolean jflush_before_backup) { auto* db = reinterpret_cast(db_handle); auto* backup_engine = @@ -66,7 +66,7 @@ void Java_org_rocksdb_BackupEngine_createNewBackup( * Signature: (JJLjava/lang/String;Z)V */ void Java_org_rocksdb_BackupEngine_createNewBackupWithMetadata( - JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jlong db_handle, + JNIEnv* env, jclass /*jbe*/, jlong jbe_handle, jlong db_handle, jstring japp_metadata, jboolean jflush_before_backup) { auto* db = reinterpret_cast(db_handle); auto* backup_engine = @@ -97,7 +97,7 @@ void Java_org_rocksdb_BackupEngine_createNewBackupWithMetadata( * Signature: (J)Ljava/util/List; */ jobject Java_org_rocksdb_BackupEngine_getBackupInfo(JNIEnv* env, - jobject /*jbe*/, + jclass /*jcls*/, jlong jbe_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); @@ -112,7 +112,7 @@ jobject Java_org_rocksdb_BackupEngine_getBackupInfo(JNIEnv* env, * Signature: (J)[I */ jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups(JNIEnv* env, - jobject /*jbe*/, + jclass /*jcls*/, jlong jbe_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); @@ -139,7 +139,7 @@ jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups(JNIEnv* env, * Method: garbageCollect * Signature: (J)V */ -void Java_org_rocksdb_BackupEngine_garbageCollect(JNIEnv* env, jobject /*jbe*/, +void Java_org_rocksdb_BackupEngine_garbageCollect(JNIEnv* env, jclass /*jbe*/, jlong jbe_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); @@ -157,7 +157,7 @@ void Java_org_rocksdb_BackupEngine_garbageCollect(JNIEnv* env, jobject /*jbe*/, * Method: purgeOldBackups * Signature: (JI)V */ -void Java_org_rocksdb_BackupEngine_purgeOldBackups(JNIEnv* env, jobject /*jbe*/, +void Java_org_rocksdb_BackupEngine_purgeOldBackups(JNIEnv* env, jclass /*jbe*/, jlong jbe_handle, jint jnum_backups_to_keep) { auto* backup_engine = @@ -177,7 +177,7 @@ void Java_org_rocksdb_BackupEngine_purgeOldBackups(JNIEnv* env, jobject /*jbe*/, * Method: deleteBackup * Signature: (JI)V */ -void Java_org_rocksdb_BackupEngine_deleteBackup(JNIEnv* env, jobject /*jbe*/, +void Java_org_rocksdb_BackupEngine_deleteBackup(JNIEnv* env, jclass /*jbe*/, jlong jbe_handle, jint jbackup_id) { auto* backup_engine = @@ -198,7 +198,7 @@ void Java_org_rocksdb_BackupEngine_deleteBackup(JNIEnv* env, jobject /*jbe*/, * Signature: (JILjava/lang/String;Ljava/lang/String;J)V */ void Java_org_rocksdb_BackupEngine_restoreDbFromBackup( - JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jint jbackup_id, + JNIEnv* env, jclass /*jbe*/, jlong jbe_handle, jint jbackup_id, jstring jdb_dir, jstring jwal_dir, jlong jrestore_options_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); @@ -235,7 +235,7 @@ void Java_org_rocksdb_BackupEngine_restoreDbFromBackup( * Signature: (JLjava/lang/String;Ljava/lang/String;J)V */ void Java_org_rocksdb_BackupEngine_restoreDbFromLatestBackup( - JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jstring jdb_dir, + JNIEnv* env, jclass /*jbe*/, jlong jbe_handle, jstring jdb_dir, jstring jwal_dir, jlong jrestore_options_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); @@ -270,9 +270,9 @@ void Java_org_rocksdb_BackupEngine_restoreDbFromLatestBackup( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_BackupEngine_disposeInternal(JNIEnv* /*env*/, - jobject /*jbe*/, - jlong jbe_handle) { +void Java_org_rocksdb_BackupEngine_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jbe_handle) { auto* be = reinterpret_cast(jbe_handle); assert(be != nullptr); delete be; diff --git a/java/rocksjni/cassandra_value_operator.cc b/java/rocksjni/cassandra_value_operator.cc index 6de28c1b1cf..0a7153610c1 100644 --- a/java/rocksjni/cassandra_value_operator.cc +++ b/java/rocksjni/cassandra_value_operator.cc @@ -41,8 +41,8 @@ jlong Java_org_rocksdb_CassandraValueMergeOperator_newSharedCassandraValueMergeO * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CassandraValueMergeOperator_disposeInternal( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +void Java_org_rocksdb_CassandraValueMergeOperator_disposeInternalJni( + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* op = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/checkpoint.cc b/java/rocksjni/checkpoint.cc index cef5f3ca88e..09eca34f2c4 100644 --- a/java/rocksjni/checkpoint.cc +++ b/java/rocksjni/checkpoint.cc @@ -37,9 +37,9 @@ jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* /*env*/, * Method: dispose * Signature: (J)V */ -void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_Checkpoint_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jhandle) { auto* checkpoint = reinterpret_cast(jhandle); assert(checkpoint != nullptr); delete checkpoint; @@ -50,7 +50,7 @@ void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* /*env*/, * Method: createCheckpoint * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_Checkpoint_createCheckpoint(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Checkpoint_createCheckpoint(JNIEnv* env, jclass /*jcls*/, jlong jcheckpoint_handle, jstring jcheckpoint_path) { const char* checkpoint_path = env->GetStringUTFChars(jcheckpoint_path, 0); diff --git a/java/rocksjni/clock_cache.cc b/java/rocksjni/clock_cache.cc index e04991aa9c0..5801311a391 100644 --- a/java/rocksjni/clock_cache.cc +++ b/java/rocksjni/clock_cache.cc @@ -33,9 +33,9 @@ jlong Java_org_rocksdb_ClockCache_newClockCache( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ClockCache_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_ClockCache_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* sptr_clock_cache = reinterpret_cast*>(jhandle); delete sptr_clock_cache; // delete std::shared_ptr diff --git a/java/rocksjni/columnfamilyhandle.cc b/java/rocksjni/columnfamilyhandle.cc index 4140580f02f..70456a65ed0 100644 --- a/java/rocksjni/columnfamilyhandle.cc +++ b/java/rocksjni/columnfamilyhandle.cc @@ -19,7 +19,7 @@ * Signature: (J)[B */ jbyteArray Java_org_rocksdb_ColumnFamilyHandle_getName(JNIEnv* env, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle) { auto* cfh = reinterpret_cast(jhandle); std::string cf_name = cfh->GetName(); @@ -31,8 +31,7 @@ jbyteArray Java_org_rocksdb_ColumnFamilyHandle_getName(JNIEnv* env, * Method: getID * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyHandle_getID(JNIEnv* /*env*/, - jobject /*jobj*/, +jint Java_org_rocksdb_ColumnFamilyHandle_getID(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* cfh = reinterpret_cast(jhandle); const int32_t id = cfh->GetID(); @@ -45,7 +44,7 @@ jint Java_org_rocksdb_ColumnFamilyHandle_getID(JNIEnv* /*env*/, * Signature: (J)Lorg/rocksdb/ColumnFamilyDescriptor; */ jobject Java_org_rocksdb_ColumnFamilyHandle_getDescriptor(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* cfh = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::ColumnFamilyDescriptor desc; @@ -63,9 +62,9 @@ jobject Java_org_rocksdb_ColumnFamilyHandle_getDescriptor(JNIEnv* env, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_ColumnFamilyHandle_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jhandle) { auto* cfh = reinterpret_cast(jhandle); assert(cfh != nullptr); delete cfh; diff --git a/java/rocksjni/compact_range_options.cc b/java/rocksjni/compact_range_options.cc index d07263ab683..a374795c4df 100644 --- a/java/rocksjni/compact_range_options.cc +++ b/java/rocksjni/compact_range_options.cc @@ -80,7 +80,7 @@ jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( * Signature: (J)Z */ jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast( @@ -93,7 +93,7 @@ jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle, jboolean exclusive_manual_compaction) { auto* options = reinterpret_cast(jhandle); @@ -107,7 +107,7 @@ void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( * Signature: (J)I */ jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::BottommostLevelCompactionJni:: @@ -121,7 +121,7 @@ jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction( * Signature: (JI)V */ void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle, jint bottommost_level_compaction) { auto* options = reinterpret_cast(jhandle); @@ -136,7 +136,7 @@ void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( * Signature: (J)Z */ jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -149,7 +149,7 @@ jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_CompactRangeOptions_setChangeLevel( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle, jboolean change_level) { auto* options = reinterpret_cast(jhandle); options->compactRangeOptions.change_level = static_cast(change_level); @@ -161,7 +161,7 @@ void Java_org_rocksdb_CompactRangeOptions_setChangeLevel( * Signature: (J)I */ jint Java_org_rocksdb_CompactRangeOptions_targetLevel(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -174,7 +174,7 @@ jint Java_org_rocksdb_CompactRangeOptions_targetLevel(JNIEnv* /*env*/, * Signature: (JI)V */ void Java_org_rocksdb_CompactRangeOptions_setTargetLevel(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle, jint target_level) { auto* options = @@ -188,7 +188,7 @@ void Java_org_rocksdb_CompactRangeOptions_setTargetLevel(JNIEnv* /*env*/, * Signature: (J)I */ jint Java_org_rocksdb_CompactRangeOptions_targetPathId(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -201,7 +201,7 @@ jint Java_org_rocksdb_CompactRangeOptions_targetPathId(JNIEnv* /*env*/, * Signature: (JI)V */ void Java_org_rocksdb_CompactRangeOptions_setTargetPathId(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle, jint target_path_id) { auto* options = @@ -216,7 +216,7 @@ void Java_org_rocksdb_CompactRangeOptions_setTargetPathId(JNIEnv* /*env*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -229,7 +229,7 @@ jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle, jboolean allow_write_stall) { auto* options = reinterpret_cast(jhandle); @@ -243,7 +243,7 @@ void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall( * Signature: (J)I */ jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jobj*/, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -256,7 +256,7 @@ jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions(JNIEnv* /*env*/, * Signature: (JI)V */ void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { + JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle, jint max_subcompactions) { auto* options = reinterpret_cast(jhandle); options->compactRangeOptions.max_subcompactions = @@ -268,7 +268,7 @@ void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions( * Method: setFullHistoryTSLow * Signature: (JJJ)V */ -void Java_org_rocksdb_CompactRangeOptions_setFullHistoryTSLow(JNIEnv*, jobject, +void Java_org_rocksdb_CompactRangeOptions_setFullHistoryTSLow(JNIEnv*, jclass, jlong jhandle, jlong start, jlong range) { @@ -283,7 +283,7 @@ void Java_org_rocksdb_CompactRangeOptions_setFullHistoryTSLow(JNIEnv*, jobject, * Signature: (J)Lorg/rocksdb/CompactRangeOptions/Timestamp; */ jobject Java_org_rocksdb_CompactRangeOptions_fullHistoryTSLow(JNIEnv* env, - jobject, + jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -304,7 +304,7 @@ jobject Java_org_rocksdb_CompactRangeOptions_fullHistoryTSLow(JNIEnv* env, * Method: setCanceled * Signature: (JZ)V */ -void Java_org_rocksdb_CompactRangeOptions_setCanceled(JNIEnv*, jobject, +void Java_org_rocksdb_CompactRangeOptions_setCanceled(JNIEnv*, jclass, jlong jhandle, jboolean jcanceled) { auto* options = @@ -317,7 +317,7 @@ void Java_org_rocksdb_CompactRangeOptions_setCanceled(JNIEnv*, jobject, * Method: canceled * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompactRangeOptions_canceled(JNIEnv*, jobject, +jboolean Java_org_rocksdb_CompactRangeOptions_canceled(JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -329,9 +329,9 @@ jboolean Java_org_rocksdb_CompactRangeOptions_canceled(JNIEnv*, jobject, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactRangeOptions_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_CompactRangeOptions_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); delete options; diff --git a/java/rocksjni/compaction_filter_factory.cc b/java/rocksjni/compaction_filter_factory.cc index 16fbdbbddd6..61cf858fa45 100644 --- a/java/rocksjni/compaction_filter_factory.cc +++ b/java/rocksjni/compaction_filter_factory.cc @@ -34,7 +34,7 @@ jlong Java_org_rocksdb_AbstractCompactionFilterFactory_createNewCompactionFilter * Signature: (J)V */ void Java_org_rocksdb_AbstractCompactionFilterFactory_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* ptr_sptr_cff = reinterpret_cast< std::shared_ptr*>( jhandle); diff --git a/java/rocksjni/compaction_job_info.cc b/java/rocksjni/compaction_job_info.cc index fb292f59ce5..b3a48b3737a 100644 --- a/java/rocksjni/compaction_job_info.cc +++ b/java/rocksjni/compaction_job_info.cc @@ -28,8 +28,8 @@ jlong Java_org_rocksdb_CompactionJobInfo_newCompactionJobInfo(JNIEnv*, jclass) { * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionJobInfo_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_CompactionJobInfo_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); delete compact_job_info; diff --git a/java/rocksjni/compaction_job_stats.cc b/java/rocksjni/compaction_job_stats.cc index a2599c1321f..788a846efb3 100644 --- a/java/rocksjni/compaction_job_stats.cc +++ b/java/rocksjni/compaction_job_stats.cc @@ -30,8 +30,8 @@ jlong Java_org_rocksdb_CompactionJobStats_newCompactionJobStats(JNIEnv*, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionJobStats_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_CompactionJobStats_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); delete compact_job_stats; diff --git a/java/rocksjni/compaction_options.cc b/java/rocksjni/compaction_options.cc index bbbde0313fa..fe5e8edf91b 100644 --- a/java/rocksjni/compaction_options.cc +++ b/java/rocksjni/compaction_options.cc @@ -28,8 +28,8 @@ jlong Java_org_rocksdb_CompactionOptions_newCompactionOptions(JNIEnv*, jclass) { * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionOptions_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_CompactionOptions_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* compact_opts = reinterpret_cast(jhandle); delete compact_opts; diff --git a/java/rocksjni/compaction_options_fifo.cc b/java/rocksjni/compaction_options_fifo.cc index f6a47fec5b9..535562fb47f 100644 --- a/java/rocksjni/compaction_options_fifo.cc +++ b/java/rocksjni/compaction_options_fifo.cc @@ -29,7 +29,7 @@ jlong Java_org_rocksdb_CompactionOptionsFIFO_newCompactionOptionsFIFO(JNIEnv*, * Signature: (JJ)V */ void Java_org_rocksdb_CompactionOptionsFIFO_setMaxTableFilesSize( - JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_table_files_size) { auto* opt = reinterpret_cast(jhandle); opt->max_table_files_size = static_cast(jmax_table_files_size); @@ -40,7 +40,7 @@ void Java_org_rocksdb_CompactionOptionsFIFO_setMaxTableFilesSize( * Method: maxTableFilesSize * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -53,7 +53,7 @@ jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_CompactionOptionsFIFO_setAllowCompaction( - JNIEnv*, jobject, jlong jhandle, jboolean allow_compaction) { + JNIEnv*, jclass, jlong jhandle, jboolean allow_compaction) { auto* opt = reinterpret_cast(jhandle); opt->allow_compaction = static_cast(allow_compaction); @@ -64,8 +64,7 @@ void Java_org_rocksdb_CompactionOptionsFIFO_setAllowCompaction( * Method: allowCompaction * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, - jobject, +jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -77,7 +76,7 @@ jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionOptionsFIFO_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_CompactionOptionsFIFO_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/compaction_options_universal.cc b/java/rocksjni/compaction_options_universal.cc index 9fc6f315828..ed4dfff3d0f 100644 --- a/java/rocksjni/compaction_options_universal.cc +++ b/java/rocksjni/compaction_options_universal.cc @@ -30,7 +30,7 @@ jlong Java_org_rocksdb_CompactionOptionsUniversal_newCompactionOptionsUniversal( * Signature: (JI)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setSizeRatio( - JNIEnv*, jobject, jlong jhandle, jint jsize_ratio) { + JNIEnv*, jclass, jlong jhandle, jint jsize_ratio) { auto* opt = reinterpret_cast(jhandle); opt->size_ratio = static_cast(jsize_ratio); @@ -41,7 +41,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setSizeRatio( * Method: sizeRatio * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(JNIEnv*, jobject, +jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -54,7 +54,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setMinMergeWidth( - JNIEnv*, jobject, jlong jhandle, jint jmin_merge_width) { + JNIEnv*, jclass, jlong jhandle, jint jmin_merge_width) { auto* opt = reinterpret_cast(jhandle); opt->min_merge_width = static_cast(jmin_merge_width); @@ -65,7 +65,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMinMergeWidth( * Method: minMergeWidth * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(JNIEnv*, jobject, +jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -78,7 +78,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxMergeWidth( - JNIEnv*, jobject, jlong jhandle, jint jmax_merge_width) { + JNIEnv*, jclass, jlong jhandle, jint jmax_merge_width) { auto* opt = reinterpret_cast(jhandle); opt->max_merge_width = static_cast(jmax_merge_width); @@ -89,7 +89,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxMergeWidth( * Method: maxMergeWidth * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(JNIEnv*, jobject, +jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -102,7 +102,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxSizeAmplificationPercent( - JNIEnv*, jobject, jlong jhandle, jint jmax_size_amplification_percent) { + JNIEnv*, jclass, jlong jhandle, jint jmax_size_amplification_percent) { auto* opt = reinterpret_cast(jhandle); opt->max_size_amplification_percent = @@ -115,7 +115,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxSizeAmplificationPercent( * Signature: (J)I */ jint Java_org_rocksdb_CompactionOptionsUniversal_maxSizeAmplificationPercent( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_size_amplification_percent); @@ -127,7 +127,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_maxSizeAmplificationPercent( * Signature: (JI)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setCompressionSizePercent( - JNIEnv*, jobject, jlong jhandle, jint jcompression_size_percent) { + JNIEnv*, jclass, jlong jhandle, jint jcompression_size_percent) { auto* opt = reinterpret_cast(jhandle); opt->compression_size_percent = @@ -140,7 +140,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setCompressionSizePercent( * Signature: (J)I */ jint Java_org_rocksdb_CompactionOptionsUniversal_compressionSizePercent( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->compression_size_percent); @@ -152,7 +152,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_compressionSizePercent( * Signature: (JB)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setStopStyle( - JNIEnv*, jobject, jlong jhandle, jbyte jstop_style_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jstop_style_value) { auto* opt = reinterpret_cast(jhandle); opt->stop_style = @@ -165,7 +165,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setStopStyle( * Method: stopStyle * Signature: (J)B */ -jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(JNIEnv*, jobject, +jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -179,7 +179,7 @@ jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setAllowTrivialMove( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_trivial_move) { + JNIEnv*, jclass, jlong jhandle, jboolean jallow_trivial_move) { auto* opt = reinterpret_cast(jhandle); opt->allow_trivial_move = static_cast(jallow_trivial_move); @@ -191,7 +191,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setAllowTrivialMove( * Signature: (J)Z */ jboolean Java_org_rocksdb_CompactionOptionsUniversal_allowTrivialMove( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return opt->allow_trivial_move; @@ -202,8 +202,8 @@ jboolean Java_org_rocksdb_CompactionOptionsUniversal_allowTrivialMove( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionOptionsUniversal_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_CompactionOptionsUniversal_disposeInternalJni( + JNIEnv*, jclass, jlong jhandle) { delete reinterpret_cast( jhandle); } diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc index 11279c4ce08..be49f5ef176 100644 --- a/java/rocksjni/comparator.cc +++ b/java/rocksjni/comparator.cc @@ -39,8 +39,7 @@ jlong Java_org_rocksdb_AbstractComparator_createNewComparator( * Method: usingDirectBuffers * Signature: (J)Z */ -jboolean Java_org_rocksdb_AbstractComparator_usingDirectBuffers(JNIEnv*, - jobject, +jboolean Java_org_rocksdb_AbstractComparator_usingDirectBuffers(JNIEnv*, jclass, jlong jhandle) { auto* c = reinterpret_cast(jhandle); @@ -53,7 +52,7 @@ jboolean Java_org_rocksdb_AbstractComparator_usingDirectBuffers(JNIEnv*, * Signature: (J)V */ void Java_org_rocksdb_NativeComparatorWrapper_disposeInternal( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jcomparator_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jcomparator_handle) { auto* comparator = reinterpret_cast(jcomparator_handle); delete comparator; diff --git a/java/rocksjni/compression_options.cc b/java/rocksjni/compression_options.cc index 53f2405601e..f19822fcb4a 100644 --- a/java/rocksjni/compression_options.cc +++ b/java/rocksjni/compression_options.cc @@ -28,7 +28,7 @@ jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions(JNIEnv*, * Method: setWindowBits * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setWindowBits(JNIEnv*, jobject, +void Java_org_rocksdb_CompressionOptions_setWindowBits(JNIEnv*, jclass, jlong jhandle, jint jwindow_bits) { auto* opt = reinterpret_cast(jhandle); @@ -40,7 +40,7 @@ void Java_org_rocksdb_CompressionOptions_setWindowBits(JNIEnv*, jobject, * Method: windowBits * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_windowBits(JNIEnv*, jobject, +jint Java_org_rocksdb_CompressionOptions_windowBits(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->window_bits); @@ -51,7 +51,7 @@ jint Java_org_rocksdb_CompressionOptions_windowBits(JNIEnv*, jobject, * Method: setLevel * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setLevel(JNIEnv*, jobject, +void Java_org_rocksdb_CompressionOptions_setLevel(JNIEnv*, jclass, jlong jhandle, jint jlevel) { auto* opt = reinterpret_cast(jhandle); opt->level = static_cast(jlevel); @@ -62,8 +62,7 @@ void Java_org_rocksdb_CompressionOptions_setLevel(JNIEnv*, jobject, * Method: level * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_level(JNIEnv*, jobject, - jlong jhandle) { +jint Java_org_rocksdb_CompressionOptions_level(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->level); } @@ -73,7 +72,7 @@ jint Java_org_rocksdb_CompressionOptions_level(JNIEnv*, jobject, * Method: setStrategy * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setStrategy(JNIEnv*, jobject, +void Java_org_rocksdb_CompressionOptions_setStrategy(JNIEnv*, jclass, jlong jhandle, jint jstrategy) { auto* opt = reinterpret_cast(jhandle); @@ -85,7 +84,7 @@ void Java_org_rocksdb_CompressionOptions_setStrategy(JNIEnv*, jobject, * Method: strategy * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_strategy(JNIEnv*, jobject, +jint Java_org_rocksdb_CompressionOptions_strategy(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->strategy); @@ -96,7 +95,7 @@ jint Java_org_rocksdb_CompressionOptions_strategy(JNIEnv*, jobject, * Method: setMaxDictBytes * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(JNIEnv*, jobject, +void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(JNIEnv*, jclass, jlong jhandle, jint jmax_dict_bytes) { auto* opt = reinterpret_cast(jhandle); @@ -108,7 +107,7 @@ void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(JNIEnv*, jobject, * Method: maxDictBytes * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_maxDictBytes(JNIEnv*, jobject, +jint Java_org_rocksdb_CompressionOptions_maxDictBytes(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_dict_bytes); @@ -120,7 +119,7 @@ jint Java_org_rocksdb_CompressionOptions_maxDictBytes(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_CompressionOptions_setZstdMaxTrainBytes( - JNIEnv*, jobject, jlong jhandle, jint jzstd_max_train_bytes) { + JNIEnv*, jclass, jlong jhandle, jint jzstd_max_train_bytes) { auto* opt = reinterpret_cast(jhandle); opt->zstd_max_train_bytes = static_cast(jzstd_max_train_bytes); } @@ -130,7 +129,7 @@ void Java_org_rocksdb_CompressionOptions_setZstdMaxTrainBytes( * Method: zstdMaxTrainBytes * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes(JNIEnv*, jobject, +jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->zstd_max_train_bytes); @@ -142,7 +141,7 @@ jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes( - JNIEnv*, jobject, jlong jhandle, jlong jmax_dict_buffer_bytes) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_dict_buffer_bytes) { auto* opt = reinterpret_cast(jhandle); opt->max_dict_buffer_bytes = static_cast(jmax_dict_buffer_bytes); } @@ -152,7 +151,7 @@ void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes( * Method: maxDictBufferBytes * Signature: (J)J */ -jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject, +jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_dict_buffer_bytes); @@ -164,7 +163,7 @@ jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_CompressionOptions_setUseZstdDictTrainer( - JNIEnv*, jobject, jlong jhandle, jboolean juse_zstd_dict_trainer) { + JNIEnv*, jclass, jlong jhandle, jboolean juse_zstd_dict_trainer) { auto* opt = reinterpret_cast(jhandle); opt->use_zstd_dict_trainer = juse_zstd_dict_trainer == JNI_TRUE; } @@ -174,8 +173,7 @@ void Java_org_rocksdb_CompressionOptions_setUseZstdDictTrainer( * Method: zstdMaxTrainBytes * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompressionOptions_useZstdDictTrainer(JNIEnv*, - jobject, +jboolean Java_org_rocksdb_CompressionOptions_useZstdDictTrainer(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->use_zstd_dict_trainer); @@ -186,7 +184,7 @@ jboolean Java_org_rocksdb_CompressionOptions_useZstdDictTrainer(JNIEnv*, * Method: setEnabled * Signature: (JZ)V */ -void Java_org_rocksdb_CompressionOptions_setEnabled(JNIEnv*, jobject, +void Java_org_rocksdb_CompressionOptions_setEnabled(JNIEnv*, jclass, jlong jhandle, jboolean jenabled) { auto* opt = reinterpret_cast(jhandle); @@ -198,7 +196,7 @@ void Java_org_rocksdb_CompressionOptions_setEnabled(JNIEnv*, jobject, * Method: enabled * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompressionOptions_enabled(JNIEnv*, jobject, +jboolean Java_org_rocksdb_CompressionOptions_enabled(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enabled); @@ -208,7 +206,7 @@ jboolean Java_org_rocksdb_CompressionOptions_enabled(JNIEnv*, jobject, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompressionOptions_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_CompressionOptions_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/concurrent_task_limiter.cc b/java/rocksjni/concurrent_task_limiter.cc index 0b0b2d27172..602fbd1190d 100644 --- a/java/rocksjni/concurrent_task_limiter.cc +++ b/java/rocksjni/concurrent_task_limiter.cc @@ -88,9 +88,8 @@ jint Java_org_rocksdb_ConcurrentTaskLimiterImpl_outstandingTask(JNIEnv*, jclass, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ConcurrentTaskLimiterImpl_disposeInternal(JNIEnv*, - jobject, - jlong jhandle) { +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_disposeInternalJni( + JNIEnv*, jclass, jlong jhandle) { auto* ptr = reinterpret_cast< std::shared_ptr*>(jhandle); delete ptr; // delete std::shared_ptr diff --git a/java/rocksjni/config_options.cc b/java/rocksjni/config_options.cc index 55a9cbb663d..1532dd9e80a 100644 --- a/java/rocksjni/config_options.cc +++ b/java/rocksjni/config_options.cc @@ -19,8 +19,8 @@ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ConfigOptions_disposeInternal(JNIEnv *, jobject, - jlong jhandle) { +void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass, + jlong jhandle) { auto *co = reinterpret_cast(jhandle); assert(co != nullptr); delete co; diff --git a/java/rocksjni/env.cc b/java/rocksjni/env.cc index bb739fe2b42..a26b9f57585 100644 --- a/java/rocksjni/env.cc +++ b/java/rocksjni/env.cc @@ -33,8 +33,7 @@ jlong Java_org_rocksdb_Env_getDefaultEnvInternal(JNIEnv*, jclass) { * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksEnv_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_RocksEnv_disposeInternal(JNIEnv*, jclass, jlong jhandle) { auto* e = reinterpret_cast(jhandle); assert(e != nullptr); delete e; @@ -45,7 +44,7 @@ void Java_org_rocksdb_RocksEnv_disposeInternal(JNIEnv*, jobject, * Method: setBackgroundThreads * Signature: (JIB)V */ -void Java_org_rocksdb_Env_setBackgroundThreads(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Env_setBackgroundThreads(JNIEnv*, jclass, jlong jhandle, jint jnum, jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); @@ -59,7 +58,7 @@ void Java_org_rocksdb_Env_setBackgroundThreads(JNIEnv*, jobject, jlong jhandle, * Method: getBackgroundThreads * Signature: (JB)I */ -jint Java_org_rocksdb_Env_getBackgroundThreads(JNIEnv*, jobject, jlong jhandle, +jint Java_org_rocksdb_Env_getBackgroundThreads(JNIEnv*, jclass, jlong jhandle, jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); const int num = rocks_env->GetBackgroundThreads( @@ -72,7 +71,7 @@ jint Java_org_rocksdb_Env_getBackgroundThreads(JNIEnv*, jobject, jlong jhandle, * Method: getThreadPoolQueueLen * Signature: (JB)I */ -jint Java_org_rocksdb_Env_getThreadPoolQueueLen(JNIEnv*, jobject, jlong jhandle, +jint Java_org_rocksdb_Env_getThreadPoolQueueLen(JNIEnv*, jclass, jlong jhandle, jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); const int queue_len = rocks_env->GetThreadPoolQueueLen( @@ -85,7 +84,7 @@ jint Java_org_rocksdb_Env_getThreadPoolQueueLen(JNIEnv*, jobject, jlong jhandle, * Method: incBackgroundThreadsIfNeeded * Signature: (JIB)V */ -void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded(JNIEnv*, jobject, +void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded(JNIEnv*, jclass, jlong jhandle, jint jnum, jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); @@ -99,7 +98,7 @@ void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded(JNIEnv*, jobject, * Method: lowerThreadPoolIOPriority * Signature: (JB)V */ -void Java_org_rocksdb_Env_lowerThreadPoolIOPriority(JNIEnv*, jobject, +void Java_org_rocksdb_Env_lowerThreadPoolIOPriority(JNIEnv*, jclass, jlong jhandle, jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); @@ -112,7 +111,7 @@ void Java_org_rocksdb_Env_lowerThreadPoolIOPriority(JNIEnv*, jobject, * Method: lowerThreadPoolCPUPriority * Signature: (JB)V */ -void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority(JNIEnv*, jobject, +void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority(JNIEnv*, jclass, jlong jhandle, jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); @@ -125,7 +124,7 @@ void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority(JNIEnv*, jobject, * Method: getThreadList * Signature: (J)[Lorg/rocksdb/ThreadStatus; */ -jobjectArray Java_org_rocksdb_Env_getThreadList(JNIEnv* env, jobject, +jobjectArray Java_org_rocksdb_Env_getThreadList(JNIEnv* env, jclass, jlong jhandle) { auto* rocks_env = reinterpret_cast(jhandle); std::vector thread_status; @@ -174,8 +173,8 @@ jlong Java_org_rocksdb_RocksMemEnv_createMemEnv(JNIEnv*, jclass, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksMemEnv_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_RocksMemEnv_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* e = reinterpret_cast(jhandle); assert(e != nullptr); delete e; @@ -197,8 +196,8 @@ jlong Java_org_rocksdb_TimedEnv_createTimedEnv(JNIEnv*, jclass, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TimedEnv_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_TimedEnv_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* e = reinterpret_cast(jhandle); assert(e != nullptr); delete e; diff --git a/java/rocksjni/env_options.cc b/java/rocksjni/env_options.cc index 3237e277543..9312ec2d6b4 100644 --- a/java/rocksjni/env_options.cc +++ b/java/rocksjni/env_options.cc @@ -56,8 +56,8 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *, jobject, - jlong jhandle) { +void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass, + jlong jhandle) { auto *eo = reinterpret_cast(jhandle); assert(eo != nullptr); delete eo; @@ -68,7 +68,7 @@ void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *, jobject, * Method: setUseMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jobject, +void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass, jlong jhandle, jboolean use_mmap_reads) { ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads); @@ -79,7 +79,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jobject, * Method: useMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_mmap_reads); } @@ -89,7 +89,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jobject, * Method: setUseMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jobject, +void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass, jlong jhandle, jboolean use_mmap_writes) { ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes); @@ -100,7 +100,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jobject, * Method: useMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_mmap_writes); } @@ -110,7 +110,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jobject, * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jobject, +void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass, jlong jhandle, jboolean use_direct_reads) { ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads); @@ -121,7 +121,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jobject, * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_direct_reads); } @@ -132,7 +132,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_EnvOptions_setUseDirectWrites( - JNIEnv *, jobject, jlong jhandle, jboolean use_direct_writes) { + JNIEnv *, jclass, jlong jhandle, jboolean use_direct_writes) { ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes); } @@ -141,7 +141,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectWrites( * Method: useDirectWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_direct_writes); } @@ -151,7 +151,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jobject, * Method: setAllowFallocate * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jobject, +void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass, jlong jhandle, jboolean allow_fallocate) { ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate); @@ -162,7 +162,7 @@ void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jobject, * Method: allowFallocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, allow_fallocate); } @@ -172,7 +172,7 @@ jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jobject, * Method: setSetFdCloexec * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jobject, +void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass, jlong jhandle, jboolean set_fd_cloexec) { ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec); @@ -183,7 +183,7 @@ void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jobject, * Method: setFdCloexec * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, set_fd_cloexec); } @@ -193,7 +193,7 @@ jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jobject, * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jobject, +void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass, jlong jhandle, jlong bytes_per_sync) { ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync); @@ -204,7 +204,7 @@ void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jobject, * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jobject, +jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, bytes_per_sync); } @@ -215,7 +215,7 @@ jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize( - JNIEnv *, jobject, jlong jhandle, jboolean fallocate_with_keep_size) { + JNIEnv *, jclass, jlong jhandle, jboolean fallocate_with_keep_size) { ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size); } @@ -224,7 +224,7 @@ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize( * Method: fallocateWithKeepSize * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jobject, +jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size); } @@ -235,7 +235,7 @@ jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize( - JNIEnv *, jobject, jlong jhandle, jlong compaction_readahead_size) { + JNIEnv *, jclass, jlong jhandle, jlong compaction_readahead_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size); } @@ -244,7 +244,7 @@ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jobject, +jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, compaction_readahead_size); } @@ -255,7 +255,7 @@ jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize( - JNIEnv *, jobject, jlong jhandle, jlong random_access_max_buffer_size) { + JNIEnv *, jclass, jlong jhandle, jlong random_access_max_buffer_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, random_access_max_buffer_size); } @@ -264,7 +264,7 @@ void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize( * Method: randomAccessMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *, jobject, +jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, random_access_max_buffer_size); } @@ -275,7 +275,7 @@ jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize( - JNIEnv *, jobject, jlong jhandle, jlong writable_file_max_buffer_size) { + JNIEnv *, jclass, jlong jhandle, jlong writable_file_max_buffer_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size); } @@ -284,7 +284,7 @@ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jobject, +jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size); } @@ -294,8 +294,7 @@ jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jobject, * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jobject, - jlong jhandle, +void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jclass, jlong jhandle, jlong rl_handle) { auto *sptr_rate_limiter = reinterpret_cast *>( diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc index ed22016d237..a71401cdfbd 100644 --- a/java/rocksjni/filter.cc +++ b/java/rocksjni/filter.cc @@ -37,8 +37,9 @@ jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter(JNIEnv* /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_Filter_disposeInternal(JNIEnv* /*env*/, jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_Filter_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* handle = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/ingest_external_file_options.cc b/java/rocksjni/ingest_external_file_options.cc index 052cf33256e..2a9b73b1689 100644 --- a/java/rocksjni/ingest_external_file_options.cc +++ b/java/rocksjni/ingest_external_file_options.cc @@ -44,7 +44,7 @@ jlong Java_org_rocksdb_IngestExternalFileOptions_newIngestExternalFileOptions__Z * Method: moveFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_IngestExternalFileOptions_moveFiles(JNIEnv*, jobject, +jboolean Java_org_rocksdb_IngestExternalFileOptions_moveFiles(JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -57,7 +57,7 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_moveFiles(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setMoveFiles( - JNIEnv*, jobject, jlong jhandle, jboolean jmove_files) { + JNIEnv*, jclass, jlong jhandle, jboolean jmove_files) { auto* options = reinterpret_cast(jhandle); options->move_files = static_cast(jmove_files); @@ -69,7 +69,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setMoveFiles( * Signature: (J)Z */ jboolean Java_org_rocksdb_IngestExternalFileOptions_snapshotConsistency( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->snapshot_consistency); @@ -81,7 +81,7 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_snapshotConsistency( * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setSnapshotConsistency( - JNIEnv*, jobject, jlong jhandle, jboolean jsnapshot_consistency) { + JNIEnv*, jclass, jlong jhandle, jboolean jsnapshot_consistency) { auto* options = reinterpret_cast(jhandle); options->snapshot_consistency = static_cast(jsnapshot_consistency); @@ -93,7 +93,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setSnapshotConsistency( * Signature: (J)Z */ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowGlobalSeqNo( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->allow_global_seqno); @@ -105,7 +105,7 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowGlobalSeqNo( * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setAllowGlobalSeqNo( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_global_seqno) { + JNIEnv*, jclass, jlong jhandle, jboolean jallow_global_seqno) { auto* options = reinterpret_cast(jhandle); options->allow_global_seqno = static_cast(jallow_global_seqno); @@ -117,7 +117,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setAllowGlobalSeqNo( * Signature: (J)Z */ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowBlockingFlush( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->allow_blocking_flush); @@ -129,7 +129,7 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowBlockingFlush( * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setAllowBlockingFlush( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_blocking_flush) { + JNIEnv*, jclass, jlong jhandle, jboolean jallow_blocking_flush) { auto* options = reinterpret_cast(jhandle); options->allow_blocking_flush = static_cast(jallow_blocking_flush); @@ -141,7 +141,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setAllowBlockingFlush( * Signature: (J)Z */ jboolean Java_org_rocksdb_IngestExternalFileOptions_ingestBehind( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return options->ingest_behind == JNI_TRUE; @@ -153,7 +153,7 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_ingestBehind( * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setIngestBehind( - JNIEnv*, jobject, jlong jhandle, jboolean jingest_behind) { + JNIEnv*, jclass, jlong jhandle, jboolean jingest_behind) { auto* options = reinterpret_cast(jhandle); options->ingest_behind = jingest_behind == JNI_TRUE; @@ -165,7 +165,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setIngestBehind( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL -Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno(JNIEnv*, jobject, +Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno(JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); @@ -179,7 +179,7 @@ Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno(JNIEnv*, jobject, */ JNIEXPORT void JNICALL Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobalSeqno( - JNIEnv*, jobject, jlong jhandle, jboolean jwrite_global_seqno) { + JNIEnv*, jclass, jlong jhandle, jboolean jwrite_global_seqno) { auto* options = reinterpret_cast(jhandle); options->write_global_seqno = jwrite_global_seqno == JNI_TRUE; @@ -190,9 +190,8 @@ Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobalSeqno( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_IngestExternalFileOptions_disposeInternal(JNIEnv*, - jobject, - jlong jhandle) { +void Java_org_rocksdb_IngestExternalFileOptions_disposeInternalJni( + JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); delete options; diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc index 3ddb9778bcb..95d94a874ad 100644 --- a/java/rocksjni/iterator.cc +++ b/java/rocksjni/iterator.cc @@ -22,9 +22,9 @@ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_disposeInternalJni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong handle) { auto* it = reinterpret_cast(handle); assert(it != nullptr); delete it; @@ -35,9 +35,9 @@ void Java_org_rocksdb_RocksIterator_disposeInternal(JNIEnv* /*env*/, * Method: isValid0 * Signature: (J)Z */ -jboolean Java_org_rocksdb_RocksIterator_isValid0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +jboolean Java_org_rocksdb_RocksIterator_isValid0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { return reinterpret_cast(handle)->Valid(); } @@ -46,9 +46,9 @@ jboolean Java_org_rocksdb_RocksIterator_isValid0(JNIEnv* /*env*/, * Method: seekToFirst0 * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_seekToFirst0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_seekToFirst0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->SeekToFirst(); } @@ -57,9 +57,9 @@ void Java_org_rocksdb_RocksIterator_seekToFirst0(JNIEnv* /*env*/, * Method: seekToLast0 * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_seekToLast0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_seekToLast0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->SeekToLast(); } @@ -68,8 +68,8 @@ void Java_org_rocksdb_RocksIterator_seekToLast0(JNIEnv* /*env*/, * Method: next0 * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_next0Jni(JNIEnv* /*env*/, jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->Next(); } @@ -78,8 +78,8 @@ void Java_org_rocksdb_RocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: prev0 * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_prev0Jni(JNIEnv* /*env*/, jclass /*jobj*/, + jlong handle) { reinterpret_cast(handle)->Prev(); } @@ -88,8 +88,8 @@ void Java_org_rocksdb_RocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: refresh0 * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_refresh0(JNIEnv* env, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_refresh0Jni(JNIEnv* env, jclass /*jcls*/, + jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Status s = it->Refresh(); @@ -100,14 +100,34 @@ void Java_org_rocksdb_RocksIterator_refresh0(JNIEnv* env, jobject /*jobj*/, ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_RocksIterator + * Method: refresh1 + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksIterator_refresh1(JNIEnv* env, jobject /*jobj*/, + jlong handle, + jlong snapshot_handle) { + auto* it = reinterpret_cast(handle); + auto* snapshot = + reinterpret_cast(snapshot_handle); + ROCKSDB_NAMESPACE::Status s = it->Refresh(snapshot); + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} + /* * Class: org_rocksdb_RocksIterator * Method: seek0 * Signature: (J[BI)V */ -void Java_org_rocksdb_RocksIterator_seek0(JNIEnv* env, jobject /*jobj*/, - jlong handle, jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_RocksIterator_seek0Jni(JNIEnv* env, jclass /*jcls*/, + jlong handle, jbyteArray jtarget, + jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { it->Seek(target_slice); @@ -124,8 +144,8 @@ void Java_org_rocksdb_RocksIterator_seek0(JNIEnv* env, jobject /*jobj*/, * Method: seek0 * Signature: (J[BII)V */ -void Java_org_rocksdb_RocksIterator_seekByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget, +void Java_org_rocksdb_RocksIterator_seekByteArray0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -140,10 +160,11 @@ void Java_org_rocksdb_RocksIterator_seekByteArray0( * Method: seekDirect0 * Signature: (JLjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_RocksIterator_seekDirect0(JNIEnv* env, jobject /*jobj*/, - jlong handle, jobject jtarget, - jint jtarget_off, - jint jtarget_len) { +void Java_org_rocksdb_RocksIterator_seekDirect0Jni(JNIEnv* env, jclass /*jobj*/, + jlong handle, + jobject jtarget, + jint jtarget_off, + jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { it->Seek(target_slice); @@ -157,8 +178,8 @@ void Java_org_rocksdb_RocksIterator_seekDirect0(JNIEnv* env, jobject /*jobj*/, * Method: seekForPrevDirect0 * Signature: (JLjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_RocksIterator_seekForPrevDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, +void Java_org_rocksdb_RocksIterator_seekForPrevDirect0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seekPrev = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -173,10 +194,11 @@ void Java_org_rocksdb_RocksIterator_seekForPrevDirect0( * Method: seekForPrev0 * Signature: (J[BI)V */ -void Java_org_rocksdb_RocksIterator_seekForPrev0(JNIEnv* env, jobject /*jobj*/, - jlong handle, - jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_RocksIterator_seekForPrev0Jni(JNIEnv* env, + jclass /*jcls*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { it->SeekForPrev(target_slice); @@ -193,8 +215,8 @@ void Java_org_rocksdb_RocksIterator_seekForPrev0(JNIEnv* env, jobject /*jobj*/, * Method: seek0 * Signature: (J[BII)V */ -void Java_org_rocksdb_RocksIterator_seekForPrevByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget, +void Java_org_rocksdb_RocksIterator_seekForPrevByteArray0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -209,8 +231,8 @@ void Java_org_rocksdb_RocksIterator_seekForPrevByteArray0( * Method: status0 * Signature: (J)V */ -void Java_org_rocksdb_RocksIterator_status0(JNIEnv* env, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_RocksIterator_status0Jni(JNIEnv* env, jclass /*jcls*/, + jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Status s = it->status(); @@ -226,7 +248,7 @@ void Java_org_rocksdb_RocksIterator_status0(JNIEnv* env, jobject /*jobj*/, * Method: key0 * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_RocksIterator_key0(JNIEnv* env, jobject /*jobj*/, +jbyteArray Java_org_rocksdb_RocksIterator_key0(JNIEnv* env, jclass /*jcls*/, jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice key_slice = it->key(); @@ -247,7 +269,7 @@ jbyteArray Java_org_rocksdb_RocksIterator_key0(JNIEnv* env, jobject /*jobj*/, * Method: keyDirect0 * Signature: (JLjava/nio/ByteBuffer;II)I */ -jint Java_org_rocksdb_RocksIterator_keyDirect0(JNIEnv* env, jobject /*jobj*/, +jint Java_org_rocksdb_RocksIterator_keyDirect0(JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { @@ -265,7 +287,7 @@ jint Java_org_rocksdb_RocksIterator_keyDirect0(JNIEnv* env, jobject /*jobj*/, * Method: keyByteArray0 * Signature: (J[BII)I */ -jint Java_org_rocksdb_RocksIterator_keyByteArray0(JNIEnv* env, jobject /*jobj*/, +jint Java_org_rocksdb_RocksIterator_keyByteArray0(JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jkey, jint jkey_off, jint jkey_len) { @@ -285,7 +307,7 @@ jint Java_org_rocksdb_RocksIterator_keyByteArray0(JNIEnv* env, jobject /*jobj*/, * Method: value0 * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_RocksIterator_value0(JNIEnv* env, jobject /*jobj*/, +jbyteArray Java_org_rocksdb_RocksIterator_value0(JNIEnv* env, jclass /*jcls*/, jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice value_slice = it->value(); @@ -307,7 +329,7 @@ jbyteArray Java_org_rocksdb_RocksIterator_value0(JNIEnv* env, jobject /*jobj*/, * Method: valueDirect0 * Signature: (JLjava/nio/ByteBuffer;II)I */ -jint Java_org_rocksdb_RocksIterator_valueDirect0(JNIEnv* env, jobject /*jobj*/, +jint Java_org_rocksdb_RocksIterator_valueDirect0(JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { @@ -326,7 +348,7 @@ jint Java_org_rocksdb_RocksIterator_valueDirect0(JNIEnv* env, jobject /*jobj*/, * Signature: (J[BII)I */ jint Java_org_rocksdb_RocksIterator_valueByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jvalue_target, + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jvalue_target, jint jvalue_off, jint jvalue_len) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice value_slice = it->value(); diff --git a/java/rocksjni/jni_multiget_helpers.cc b/java/rocksjni/jni_multiget_helpers.cc new file mode 100644 index 00000000000..99994ac7cce --- /dev/null +++ b/java/rocksjni/jni_multiget_helpers.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksjni/jni_multiget_helpers.h" + +#include "jni_multiget_helpers.h" +#include "rocksjni/portal.h" + +namespace ROCKSDB_NAMESPACE { + +bool MultiGetJNIKeys::fromByteArrays(JNIEnv* env, jobjectArray jkeys) { + const jsize num_keys = env->GetArrayLength(jkeys); + + for (jsize i = 0; i < num_keys; i++) { + jobject jkey = env->GetObjectArrayElement(jkeys, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + + jbyteArray jkey_ba = reinterpret_cast(jkey); + const jsize len_key = env->GetArrayLength(jkey_ba); + std::unique_ptr key = std::make_unique(len_key); + jbyte* raw_key = reinterpret_cast(key.get()); + key_bufs.push_back(std::move(key)); + env->GetByteArrayRegion(jkey_ba, 0, len_key, raw_key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jkey); + return false; + } + + slices_.push_back( + ROCKSDB_NAMESPACE::Slice(reinterpret_cast(raw_key), len_key)); + env->DeleteLocalRef(jkey); + } + + return true; +} + +bool MultiGetJNIKeys::fromByteArrays(JNIEnv* env, jobjectArray jkeys, + jintArray jkey_offs, jintArray jkey_lens) { + const jsize num_keys = env->GetArrayLength(jkeys); + + std::unique_ptr key_offs = std::make_unique(num_keys); + env->GetIntArrayRegion(jkey_offs, 0, num_keys, key_offs.get()); + if (env->ExceptionCheck()) { + return false; // exception thrown: ArrayIndexOutOfBoundsException + } + + std::unique_ptr key_lens = std::make_unique(num_keys); + env->GetIntArrayRegion(jkey_lens, 0, num_keys, key_lens.get()); + if (env->ExceptionCheck()) { + return false; // exception thrown: ArrayIndexOutOfBoundsException + } + + for (jsize i = 0; i < num_keys; i++) { + jobject jkey = env->GetObjectArrayElement(jkeys, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + + jbyteArray jkey_ba = reinterpret_cast(jkey); + const jint len_key = key_lens[i]; + std::unique_ptr key = std::make_unique(len_key); + jbyte* raw_key = reinterpret_cast(key.get()); + key_bufs.push_back(std::move(key)); + env->GetByteArrayRegion(jkey_ba, key_offs[i], len_key, raw_key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jkey); + return false; + } + + slices_.push_back( + ROCKSDB_NAMESPACE::Slice(reinterpret_cast(raw_key), len_key)); + env->DeleteLocalRef(jkey); + } + return true; +} + +bool MultiGetJNIKeys::fromByteBuffers(JNIEnv* env, jobjectArray jkeys, + jintArray jkey_offs, + jintArray jkey_lens) { + const jsize num_keys = env->GetArrayLength(jkeys); + + std::unique_ptr key_offs = std::make_unique(num_keys); + env->GetIntArrayRegion(jkey_offs, 0, num_keys, key_offs.get()); + if (env->ExceptionCheck()) { + return false; // exception thrown: ArrayIndexOutOfBoundsException + } + + std::unique_ptr key_lens = std::make_unique(num_keys); + env->GetIntArrayRegion(jkey_lens, 0, num_keys, key_lens.get()); + if (env->ExceptionCheck()) { + return false; // exception thrown: ArrayIndexOutOfBoundsException + } + + for (jsize i = 0; i < num_keys; i++) { + jobject jkey = env->GetObjectArrayElement(jkeys, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + ROCKSDB_NAMESPACE::Slice key_slice(key + key_offs[i], key_lens[i]); + slices_.push_back(key_slice); + + env->DeleteLocalRef(jkey); + } + return true; +} + +ROCKSDB_NAMESPACE::Slice* MultiGetJNIKeys::data() { return slices_.data(); } + +std::vector::size_type MultiGetJNIKeys::size() { + return slices_.size(); +} + +template +jobjectArray MultiGetJNIValues::byteArrays( + JNIEnv* env, std::vector& values, + std::vector& s) { + jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( + env, static_cast(s.size())); + if (jresults == nullptr) { + // exception occurred + OutOfMemoryErrorJni::ThrowNew(env, "Insufficient Memory for results."); + return nullptr; + } + + // add to the jresults + for (std::vector::size_type i = 0; i != s.size(); + i++) { + if (s[i].ok()) { + TValue* value = &values[i]; + jbyteArray jentry_value = + ROCKSDB_NAMESPACE::JniUtil::createJavaByteArrayWithSizeCheck( + env, value->data(), value->size()); + if (jentry_value == nullptr) { + // exception set + return nullptr; + } + + env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); + if (env->ExceptionCheck()) { + // exception thrown: + // ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jentry_value); + return nullptr; + } + + env->DeleteLocalRef(jentry_value); + } else if (s[i].code() != ROCKSDB_NAMESPACE::Status::Code::kNotFound) { + // The only way to return an error for a single key is to exception the + // entire multiGet() Previous behaviour was to return a nullptr value for + // this case and potentially succesfully return values for other keys; we + // retain this behaviour. To change it, we need to do the following: + // ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s[i]); + // return nullptr; + } + } + return jresults; +} + +template jobjectArray MultiGetJNIValues::byteArrays( + JNIEnv* env, std::vector& values, + std::vector& s); + +template jobjectArray +MultiGetJNIValues::byteArrays( + JNIEnv* env, std::vector& values, + std::vector& s); + +template +void MultiGetJNIValues::fillByteBuffersAndStatusObjects( + JNIEnv* env, std::vector& values, + std::vector& s, jobjectArray jvalues, + jintArray jvalue_sizes, jobjectArray jstatuses) { + std::vector value_size; + for (int i = 0; i < static_cast(values.size()); i++) { + auto jstatus = ROCKSDB_NAMESPACE::StatusJni::construct(env, s[i]); + if (jstatus == nullptr) { + // exception in context + return; + } + env->SetObjectArrayElement(jstatuses, i, jstatus); + + if (s[i].ok()) { + jobject jvalue_bytebuf = env->GetObjectArrayElement(jvalues, i); + if (env->ExceptionCheck()) { + // ArrayIndexOutOfBoundsException is thrown + return; + } + jlong jvalue_capacity = env->GetDirectBufferCapacity(jvalue_bytebuf); + if (jvalue_capacity == -1) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value(s) argument (argument is not a valid direct " + "ByteBuffer)"); + return; + } + void* jvalue_address = env->GetDirectBufferAddress(jvalue_bytebuf); + if (jvalue_address == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value(s) argument (argument is not a valid direct " + "ByteBuffer)"); + return; + } + + // record num returned, push back that number, which may be bigger then + // the ByteBuffer supplied. then copy as much as fits in the ByteBuffer. + static const size_t INTEGER_MAX_VALUE = + ((static_cast(1)) << 31) - 1; + if (values[i].size() > INTEGER_MAX_VALUE) { + // Indicate that the result size is bigger than can be represented in a + // java integer by setting the status to incomplete and the size to -1 + env->SetObjectArrayElement( + jstatuses, i, + ROCKSDB_NAMESPACE::StatusJni::construct( + env, Status::Incomplete("result too large to represent"))); + value_size.push_back(-1); + } else { + value_size.push_back(static_cast(values[i].size())); + } + auto copy_bytes = + std::min(static_cast(values[i].size()), jvalue_capacity); + memcpy(jvalue_address, values[i].data(), copy_bytes); + } else { + // bad status for this + value_size.push_back(0); + } + } + + env->SetIntArrayRegion(jvalue_sizes, 0, static_cast(values.size()), + value_size.data()); +} + +template void MultiGetJNIValues::fillByteBuffersAndStatusObjects< + ROCKSDB_NAMESPACE::PinnableSlice>( + JNIEnv* env, std::vector& values, + std::vector& s, jobjectArray jvalues, + jintArray jvalue_sizes, jobjectArray jstatuses); + +std::unique_ptr> +ColumnFamilyJNIHelpers::handlesFromJLongArray( + JNIEnv* env, jlongArray jcolumn_family_handles) { + if (jcolumn_family_handles == nullptr) return nullptr; + + const jsize num_cols = env->GetArrayLength(jcolumn_family_handles); + std::unique_ptr jcf_handles = std::make_unique(num_cols); + env->GetLongArrayRegion(jcolumn_family_handles, 0, num_cols, + jcf_handles.get()); + if (env->ExceptionCheck()) + // ArrayIndexOutOfBoundsException + return nullptr; + auto cf_handles = + std::make_unique>(); + + for (jsize i = 0; i < num_cols; i++) { + auto* cf_handle = reinterpret_cast( + jcf_handles.get()[i]); + cf_handles->push_back(cf_handle); + } + + return cf_handles; +} + +ROCKSDB_NAMESPACE::ColumnFamilyHandle* ColumnFamilyJNIHelpers::handleFromJLong( + JNIEnv* env, jlong jcolumn_family_handle) { + auto cf_handle = reinterpret_cast( + jcolumn_family_handle); + if (cf_handle == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, ROCKSDB_NAMESPACE::Status::InvalidArgument( + "Invalid ColumnFamilyHandle.")); + return nullptr; + } + return cf_handle; +}; + +}; // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/jni_multiget_helpers.h b/java/rocksjni/jni_multiget_helpers.h new file mode 100644 index 00000000000..5ab23d6bc58 --- /dev/null +++ b/java/rocksjni/jni_multiget_helpers.h @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +/** + * @brief Encapsulate keys and key conversions from Java/JNI objects for + * MultiGet + * + */ +class MultiGetJNIKeys { + private: + std::vector slices_; + std::vector> key_bufs; + + public: + /** + * @brief Construct helper multiget keys object from array of java keys + * + * @param env JNI environment + * @param jkeys Array of `byte[]`, each of which contains a key + * @param jkey_offs array of offsets into keys, at which each key starts + * @param jkey_lens array of key lengths + * @return true if the keys were copied successfully from the parameters + * @return false if a Java exception was raised (memory problem, or array + * indexing problem) + */ + bool fromByteArrays(JNIEnv* env, jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens); + + /** + * @brief Construct helper multiget keys object from array of java keys + * + * @param env env JNI environment + * @param jkeys jkeys Array of byte[], each of which is a key + * @return true if the keys were copied successfully from the parameters + * @return false if a Java exception was raised (memory problem, or array + * indexing problem) + */ + bool fromByteArrays(JNIEnv* env, jobjectArray jkeys); + + /** + * @brief Construct helper multiget keys object from array of java ByteBuffers + * + * @param env JNI environment + * @param jkeys Array of `java.nio.ByteBuffer`, each of which contains a key + * @param jkey_offs array of offsets into buffers, at which each key starts + * @param jkey_lens array of key lengths + * @return `true` if the keys were copied successfully from the parameters + * @return `false` if a Java exception was raised (memory problem, or array + * indexing problem) + */ + bool fromByteBuffers(JNIEnv* env, jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens); + + /** + * @brief Used when the keys need to be passed to a RocksDB function which + * takes keys as an array of slice pointers + * + * @return ROCKSDB_NAMESPACE::Slice* an array of slices, the n-th slice + * contains the n-th key created by `fromByteArrays()` or `fromByteBuffers()` + */ + ROCKSDB_NAMESPACE::Slice* data(); + + /** + * @brief Used when the keys need to be passed to a RocksDB function which + * takes keys as a vector of slices + * + * @return std::vector& a vector of slices, the n-th + * slice contains the n-th key created by `fromByteArrays()` or + * `fromByteBuffers()` + */ + inline std::vector& slices() { return slices_; } + + /** + * @brief + * + * @return std::vector::size_type the number of keys + * in this object + */ + std::vector::size_type size(); +}; + +/** + * @brief Class with static helpers for returning java objects from RocksDB data + * returned by MultiGet + * + */ +class MultiGetJNIValues { + public: + /** + * @brief create an array of `byte[]` containing the result values from + * `MultiGet` + * + * @tparam TValue a `std::string` or a `PinnableSlice` containing the result + * for a single key + * @return jobjectArray an array of `byte[]`, one per value in the input + * vector + */ + template + static jobjectArray byteArrays(JNIEnv*, std::vector&, + std::vector&); + + /** + * @brief fill a supplied array of `byte[]` with the result values from + * `MultiGet` + * + * @tparam TValue a `std::string` or a `PinnableSlice` containing the result + * for a single key + * @param jvalues the array of `byte[]` to instantiate + * @param jvalue_sizes the offsets at which to place the results in `jvalues` + * @param jstatuses the status for every individual key/value get + */ + template + static void fillByteBuffersAndStatusObjects( + JNIEnv*, std::vector&, std::vector&, + jobjectArray jvalues, jintArray jvalue_sizes, jobjectArray jstatuses); +}; + +/** + * @brief class with static helper for arrays of column family handles + * + */ +class ColumnFamilyJNIHelpers { + public: + /** + * @brief create a native array of cf handles from java handles + * + * @param env + * @param jcolumn_family_handles + * @return unique ptr to vector of handles on success, reset() unique ptr on + * failure (and a JNI exception will be generated) + */ + static std::unique_ptr> + handlesFromJLongArray(JNIEnv* env, jlongArray jcolumn_family_handles); + + /** + * @brief create a column family handle from a raw pointer, or raise an + * appropriate JNI exception + * + * @param env + * @param jcolumn_family_handle the raw pointer to convert + * @return ROCKSDB_NAMESPACE::ColumnFamilyHandle* or raises a java exception + */ + static ROCKSDB_NAMESPACE::ColumnFamilyHandle* handleFromJLong( + JNIEnv* env, jlong jcolumn_family_handle); +}; + +}; // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/jni_perf_context.cc b/java/rocksjni/jni_perf_context.cc index e0124fdaa28..8ef5c7d369f 100644 --- a/java/rocksjni/jni_perf_context.cc +++ b/java/rocksjni/jni_perf_context.cc @@ -167,6 +167,54 @@ jlong Java_org_rocksdb_PerfContext_getBlockReadCpuTime(JNIEnv*, jobject, return perf_context->block_read_cpu_time; } +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheIndexReadByte + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheIndexReadByte( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_index_read_byte; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheFilterReadByte + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheFilterReadByte( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_filter_read_byte; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheCompressionDictReadByte + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheCompressionDictReadByte( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_compression_dict_read_byte; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheReadByte + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheReadByte(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_read_byte; +} + /* * Class: org_rocksdb_PerfContext * Method: getSecondaryCacheHitCount diff --git a/java/rocksjni/kv_helper.h b/java/rocksjni/kv_helper.h new file mode 100644 index 00000000000..f33073abe31 --- /dev/null +++ b/java/rocksjni/kv_helper.h @@ -0,0 +1,281 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines helper methods for Java API write methods +// + +#pragma once + +#include + +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksjni/portal.h" + +namespace ROCKSDB_NAMESPACE { + +/** + * @brief Exception class used to make the flow of key/value (Put(), Get(), + * Merge(), ...) calls clearer. + * + * This class is used by Java API JNI methods in try { save/fetch } catch { ... + * } style. + * + */ +class KVException : public std::exception { + public: + // These values are expected on Java API calls to represent the result of a + // Get() which has failed; a negative length is returned to indicate an error. + static const int kNotFound = -1; // the key was not found in RocksDB + static const int kStatusError = + -2; // there was some other error fetching the value for the key + + /** + * @brief Throw a KVException (and potentially a Java exception) if the + * RocksDB status is "bad" + * + * @param env JNI environment needed to create a Java exception + * @param status RocksDB status to examine + */ + static void ThrowOnError(JNIEnv* env, const Status& status) { + if (status.ok()) { + return; + } + if (status.IsNotFound()) { + // IsNotFound does not generate a Java Exception, any other bad status + // does.. + throw KVException(kNotFound); + } + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); + throw KVException(kStatusError); + } + + /** + * @brief Throw a KVException and a Java exception + * + * @param env JNI environment needed to create a Java exception + * @param message content of the exception we will throw + */ + static void ThrowNew(JNIEnv* env, const std::string& message) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, message); + throw KVException(kStatusError); + } + + /** + * @brief Throw a KVException if there is already a Java exception in the JNI + * enviroment + * + * @param env + */ + static void ThrowOnError(JNIEnv* env) { + if (env->ExceptionCheck()) { + throw KVException(kStatusError); + } + } + + KVException(jint code) : kCode_(code){}; + + virtual const char* what() const throw() { + return "Exception raised by JNI. There may be a Java exception in the " + "JNIEnv. Please check!"; + } + + jint Code() const { return kCode_; } + + private: + jint kCode_; +}; + +/** + * @brief Construct a slice with the contents of a Java byte array + * + * The slice refers to an array into which the Java byte array's whole region is + * copied + */ +class JByteArraySlice { + public: + JByteArraySlice(JNIEnv* env, const jbyteArray& jarr, const jint jarr_off, + const jint jarr_len) + : arr_(new jbyte[jarr_len]), + slice_(reinterpret_cast(arr_), jarr_len) { + env->GetByteArrayRegion(jarr, jarr_off, jarr_len, arr_); + KVException::ThrowOnError(env); + }; + + ~JByteArraySlice() { + slice_.clear(); + delete[] arr_; + }; + + Slice& slice() { return slice_; } + + private: + jbyte* arr_; + Slice slice_; +}; + +/** + * @brief Construct a slice with the contents of a direct Java ByterBuffer + * + * The slice refers directly to the contents of the buffer, no copy is made. + * + */ +class JDirectBufferSlice { + public: + JDirectBufferSlice(JNIEnv* env, const jobject& jbuffer, + const jint jbuffer_off, const jint jbuffer_len) + : slice_(static_cast(env->GetDirectBufferAddress(jbuffer)) + + jbuffer_off, + jbuffer_len) { + KVException::ThrowOnError(env); + jlong capacity = env->GetDirectBufferCapacity(jbuffer); + if (capacity < jbuffer_off + jbuffer_len) { + auto message = "Direct buffer offset " + std::to_string(jbuffer_off) + + " + length " + std::to_string(jbuffer_len) + + " exceeds capacity " + std::to_string(capacity); + KVException::ThrowNew(env, message); + slice_.clear(); + } + } + + ~JDirectBufferSlice() { slice_.clear(); }; + + Slice& slice() { return slice_; } + + private: + Slice slice_; +}; + +/** + * @brief Wrap a pinnable slice with a method to retrieve the contents back into + * Java + * + * The Java Byte Array version sets the byte array's region from the slice + */ +class JByteArrayPinnableSlice { + public: + /** + * @brief Construct a new JByteArrayPinnableSlice object referring to an + * existing java byte buffer + * + * @param env + * @param jbuffer + * @param jbuffer_off + * @param jbuffer_len + */ + JByteArrayPinnableSlice(JNIEnv* env, const jbyteArray& jbuffer, + const jint jbuffer_off, const jint jbuffer_len) + : env_(env), + jbuffer_(jbuffer), + jbuffer_off_(jbuffer_off), + jbuffer_len_(jbuffer_len){}; + + /** + * @brief Construct an empty new JByteArrayPinnableSlice object + * + */ + JByteArrayPinnableSlice(JNIEnv* env) : env_(env){}; + + PinnableSlice& pinnable_slice() { return pinnable_slice_; } + + ~JByteArrayPinnableSlice() { pinnable_slice_.Reset(); }; + + /** + * @brief copy back contents of the pinnable slice into the Java ByteBuffer + * + * @return jint min of size of buffer and number of bytes in value for + * requested key + */ + jint Fetch() { + const jint pinnable_len = static_cast(pinnable_slice_.size()); + const jint result_len = std::min(jbuffer_len_, pinnable_len); + env_->SetByteArrayRegion( + jbuffer_, jbuffer_off_, result_len, + reinterpret_cast(pinnable_slice_.data())); + KVException::ThrowOnError( + env_); // exception thrown: ArrayIndexOutOfBoundsException + + return pinnable_len; + }; + + /** + * @brief create a new Java buffer and copy the result into it + * + * @return jbyteArray the java buffer holding the result + */ + jbyteArray NewByteArray() { + const jint pinnable_len = static_cast(pinnable_slice_.size()); + jbyteArray jbuffer = + ROCKSDB_NAMESPACE::JniUtil::createJavaByteArrayWithSizeCheck( + env_, pinnable_slice_.data(), pinnable_len); + KVException::ThrowOnError(env_); // OutOfMemoryError + + return jbuffer; + } + + private: + JNIEnv* env_; + jbyteArray jbuffer_; + jint jbuffer_off_; + jint jbuffer_len_; + PinnableSlice pinnable_slice_; +}; + +/** + * @brief Wrap a pinnable slice with a method to retrieve the contents back into + * Java + * + * The Java Direct Buffer version copies the memory of the buffer from the slice + */ +class JDirectBufferPinnableSlice { + public: + JDirectBufferPinnableSlice(JNIEnv* env, const jobject& jbuffer, + const jint jbuffer_off, const jint jbuffer_len) + : buffer_(static_cast(env->GetDirectBufferAddress(jbuffer)) + + jbuffer_off), + jbuffer_len_(jbuffer_len) { + jlong capacity = env->GetDirectBufferCapacity(jbuffer); + if (capacity < jbuffer_off + jbuffer_len) { + auto message = + "Invalid value argument. Capacity is less than requested region. " + "offset " + + std::to_string(jbuffer_off) + " + length " + + std::to_string(jbuffer_len) + " exceeds capacity " + + std::to_string(capacity); + KVException::ThrowNew(env, message); + } + } + + PinnableSlice& pinnable_slice() { return pinnable_slice_; } + + ~JDirectBufferPinnableSlice() { pinnable_slice_.Reset(); }; + + /** + * @brief copy back contents of the pinnable slice into the Java DirectBuffer + * + * @return jint min of size of buffer and number of bytes in value for + * requested key + */ + jint Fetch() { + const jint pinnable_len = static_cast(pinnable_slice_.size()); + const jint result_len = std::min(jbuffer_len_, pinnable_len); + + memcpy(buffer_, pinnable_slice_.data(), result_len); + return pinnable_len; + }; + + private: + char* buffer_; + jint jbuffer_len_; + PinnableSlice pinnable_slice_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/loggerjnicallback.cc b/java/rocksjni/loggerjnicallback.cc index aa9f95cd4cf..28f64e638de 100644 --- a/java/rocksjni/loggerjnicallback.cc +++ b/java/rocksjni/loggerjnicallback.cc @@ -224,37 +224,15 @@ LoggerJniCallback::~LoggerJniCallback() { /* * Class: org_rocksdb_Logger - * Method: createNewLoggerOptions + * Method: newLogger * Signature: (J)J */ -jlong Java_org_rocksdb_Logger_createNewLoggerOptions(JNIEnv* env, jobject jobj, - jlong joptions) { +jlong Java_org_rocksdb_Logger_newLogger(JNIEnv* env, jobject jobj, + jlong jlog_level) { auto* sptr_logger = new std::shared_ptr( new ROCKSDB_NAMESPACE::LoggerJniCallback(env, jobj)); - - // set log level - auto* options = reinterpret_cast(joptions); - sptr_logger->get()->SetInfoLogLevel(options->info_log_level); - - return GET_CPLUSPLUS_POINTER(sptr_logger); -} - -/* - * Class: org_rocksdb_Logger - * Method: createNewLoggerDbOptions - * Signature: (J)J - */ -jlong Java_org_rocksdb_Logger_createNewLoggerDbOptions(JNIEnv* env, - jobject jobj, - jlong jdb_options) { - auto* sptr_logger = new std::shared_ptr( - new ROCKSDB_NAMESPACE::LoggerJniCallback(env, jobj)); - - // set log level - auto* db_options = - reinterpret_cast(jdb_options); - sptr_logger->get()->SetInfoLogLevel(db_options->info_log_level); - + auto log_level = static_cast(jlog_level); + sptr_logger->get()->SetInfoLogLevel(log_level); return GET_CPLUSPLUS_POINTER(sptr_logger); } diff --git a/java/rocksjni/lru_cache.cc b/java/rocksjni/lru_cache.cc index 56dffa2f0f2..1831ebc6abd 100644 --- a/java/rocksjni/lru_cache.cc +++ b/java/rocksjni/lru_cache.cc @@ -40,9 +40,9 @@ jlong Java_org_rocksdb_LRUCache_newLRUCache(JNIEnv* /*env*/, jclass /*jcls*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_LRUCache_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_LRUCache_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* sptr_lru_cache = reinterpret_cast*>(jhandle); delete sptr_lru_cache; // delete std::shared_ptr diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index a4d02f35491..d09d2c2de9e 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -19,7 +19,7 @@ * Signature: (JII)J */ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject /*jobj*/, jlong jbucket_count, jint jheight, + JNIEnv* env, jclass /*jcls*/, jlong jbucket_count, jint jheight, jint jbranching_factor) { ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jbucket_count); @@ -38,7 +38,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( * Signature: (JJIZI)J */ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject /*jobj*/, jlong jbucket_count, + JNIEnv* env, jclass /*jcls*/, jlong jbucket_count, jlong jhuge_page_tlb_size, jint jbucket_entries_logging_threshold, jboolean jif_log_bucket_dist_when_flash, jint jthreshold_use_skiplist) { ROCKSDB_NAMESPACE::Status statusBucketCount = @@ -65,7 +65,7 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( * Signature: (J)J */ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject /*jobj*/, jlong jreserved_size) { + JNIEnv* env, jclass /*jcls*/, jlong jreserved_size) { ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jreserved_size); if (s.ok()) { @@ -82,7 +82,7 @@ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( * Signature: (J)J */ jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0( - JNIEnv* env, jobject /*jobj*/, jlong jlookahead) { + JNIEnv* env, jclass /*jcls*/, jlong jlookahead) { ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jlookahead); if (s.ok()) { diff --git a/java/rocksjni/merge_operator.cc b/java/rocksjni/merge_operator.cc index ce3c5df560d..5696a058604 100644 --- a/java/rocksjni/merge_operator.cc +++ b/java/rocksjni/merge_operator.cc @@ -61,9 +61,9 @@ jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__Ljava * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_StringAppendOperator_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_StringAppendOperator_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* sptr_string_append_op = reinterpret_cast*>( jhandle); @@ -88,9 +88,9 @@ jlong Java_org_rocksdb_UInt64AddOperator_newSharedUInt64AddOperator( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_UInt64AddOperator_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_UInt64AddOperator_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jhandle) { auto* sptr_uint64_add_op = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/optimistic_transaction_db.cc b/java/rocksjni/optimistic_transaction_db.cc index 238224f588d..d4777135074 100644 --- a/java/rocksjni/optimistic_transaction_db.cc +++ b/java/rocksjni/optimistic_transaction_db.cc @@ -145,8 +145,8 @@ Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_OptimisticTransactionDB_disposeInternalJni( + JNIEnv*, jclass, jlong jhandle) { auto* optimistic_txn_db = reinterpret_cast(jhandle); assert(optimistic_txn_db != nullptr); @@ -173,7 +173,7 @@ void Java_org_rocksdb_OptimisticTransactionDB_closeDatabase(JNIEnv* env, jclass, * Signature: (JJ)J */ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle) { auto* optimistic_txn_db = reinterpret_cast(jhandle); auto* write_options = @@ -189,7 +189,7 @@ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJ( * Signature: (JJJ)J */ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJJ( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jwrite_options_handle, jlong joptimistic_txn_options_handle) { auto* optimistic_txn_db = reinterpret_cast(jhandle); @@ -209,7 +209,7 @@ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJJ( * Signature: (JJJ)J */ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle, + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle, jlong jold_txn_handle) { auto* optimistic_txn_db = reinterpret_cast(jhandle); @@ -235,7 +235,7 @@ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJ( * Signature: (JJJJ)J */ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle, + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle, jlong joptimistic_txn_options_handle, jlong jold_txn_handle) { auto* optimistic_txn_db = reinterpret_cast(jhandle); @@ -262,7 +262,7 @@ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJJ( * Method: getBaseDB * Signature: (J)J */ -jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB(JNIEnv*, jobject, +jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB(JNIEnv*, jclass, jlong jhandle) { auto* optimistic_txn_db = reinterpret_cast(jhandle); diff --git a/java/rocksjni/optimistic_transaction_options.cc b/java/rocksjni/optimistic_transaction_options.cc index 501c6c4fbb4..336e4784575 100644 --- a/java/rocksjni/optimistic_transaction_options.cc +++ b/java/rocksjni/optimistic_transaction_options.cc @@ -31,7 +31,7 @@ jlong Java_org_rocksdb_OptimisticTransactionOptions_newOptimisticTransactionOpti * Signature: (J)Z */ jboolean Java_org_rocksdb_OptimisticTransactionOptions_isSetSnapshot( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast( jhandle); @@ -44,7 +44,7 @@ jboolean Java_org_rocksdb_OptimisticTransactionOptions_isSetSnapshot( * Signature: (JZ)V */ void Java_org_rocksdb_OptimisticTransactionOptions_setSetSnapshot( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean jset_snapshot) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jboolean jset_snapshot) { auto* opts = reinterpret_cast( jhandle); @@ -57,8 +57,7 @@ void Java_org_rocksdb_OptimisticTransactionOptions_setSetSnapshot( * Signature: (JJ)V */ void Java_org_rocksdb_OptimisticTransactionOptions_setComparator( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, - jlong jcomparator_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jcomparator_handle) { auto* opts = reinterpret_cast( jhandle); @@ -71,8 +70,8 @@ void Java_org_rocksdb_OptimisticTransactionOptions_setComparator( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_OptimisticTransactionOptions_disposeInternal( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +void Java_org_rocksdb_OptimisticTransactionOptions_disposeInternalJni( + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { delete reinterpret_cast( jhandle); } diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 0d84901c917..d5dc2f0676f 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -37,6 +37,8 @@ #include "rocksjni/portal.h" #include "rocksjni/statisticsjni.h" #include "rocksjni/table_filter_jnicallback.h" +#include "rocksjni/table_properties_collector_factory.h" +#include "util/stderr_logger.h" #include "utilities/merge_operators.h" /* @@ -80,7 +82,8 @@ jlong Java_org_rocksdb_Options_copyOptions(JNIEnv*, jclass, jlong jhandle) { * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_Options_disposeInternal(JNIEnv*, jobject, jlong handle) { +void Java_org_rocksdb_Options_disposeInternalJni(JNIEnv*, jclass, + jlong handle) { auto* op = reinterpret_cast(handle); assert(op != nullptr); delete op; @@ -91,7 +94,7 @@ void Java_org_rocksdb_Options_disposeInternal(JNIEnv*, jobject, jlong handle) { * Method: setIncreaseParallelism * Signature: (JI)V */ -void Java_org_rocksdb_Options_setIncreaseParallelism(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setIncreaseParallelism(JNIEnv*, jclass, jlong jhandle, jint totalThreads) { reinterpret_cast(jhandle)->IncreaseParallelism( @@ -103,8 +106,8 @@ void Java_org_rocksdb_Options_setIncreaseParallelism(JNIEnv*, jobject, * Method: setCreateIfMissing * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setCreateIfMissing(JNIEnv*, jobject, - jlong jhandle, jboolean flag) { +void Java_org_rocksdb_Options_setCreateIfMissing(JNIEnv*, jclass, jlong jhandle, + jboolean flag) { reinterpret_cast(jhandle)->create_if_missing = flag; } @@ -114,7 +117,7 @@ void Java_org_rocksdb_Options_setCreateIfMissing(JNIEnv*, jobject, * Method: createIfMissing * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_createIfMissing(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_createIfMissing(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->create_if_missing; @@ -125,7 +128,7 @@ jboolean Java_org_rocksdb_Options_createIfMissing(JNIEnv*, jobject, * Method: setCreateMissingColumnFamilies * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(JNIEnv*, jclass, jlong jhandle, jboolean flag) { reinterpret_cast(jhandle) @@ -137,7 +140,7 @@ void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(JNIEnv*, jobject, * Method: createMissingColumnFamilies * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->create_missing_column_families; @@ -148,7 +151,7 @@ jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(JNIEnv*, jobject, * Method: setComparatorHandle * Signature: (JI)V */ -void Java_org_rocksdb_Options_setComparatorHandle__JI(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setComparatorHandle__JI(JNIEnv*, jclass, jlong jhandle, jint builtinComparator) { switch (builtinComparator) { @@ -168,7 +171,7 @@ void Java_org_rocksdb_Options_setComparatorHandle__JI(JNIEnv*, jobject, * Method: setComparatorHandle * Signature: (JJB)V */ -void Java_org_rocksdb_Options_setComparatorHandle__JJB(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setComparatorHandle__JJB(JNIEnv*, jclass, jlong jopt_handle, jlong jcomparator_handle, jbyte jcomparator_type) { @@ -195,7 +198,7 @@ void Java_org_rocksdb_Options_setComparatorHandle__JJB(JNIEnv*, jobject, * Method: setMergeOperatorName * Signature: (JJjava/lang/String)V */ -void Java_org_rocksdb_Options_setMergeOperatorName(JNIEnv* env, jobject, +void Java_org_rocksdb_Options_setMergeOperatorName(JNIEnv* env, jclass, jlong jhandle, jstring jop_name) { const char* op_name = env->GetStringUTFChars(jop_name, nullptr); @@ -216,7 +219,7 @@ void Java_org_rocksdb_Options_setMergeOperatorName(JNIEnv* env, jobject, * Method: setMergeOperator * Signature: (JJjava/lang/String)V */ -void Java_org_rocksdb_Options_setMergeOperator(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setMergeOperator(JNIEnv*, jclass, jlong jhandle, jlong mergeOperatorHandle) { reinterpret_cast(jhandle)->merge_operator = *(reinterpret_cast*>( @@ -229,7 +232,7 @@ void Java_org_rocksdb_Options_setMergeOperator(JNIEnv*, jobject, jlong jhandle, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionFilterHandle( - JNIEnv*, jobject, jlong jopt_handle, jlong jcompactionfilter_handle) { + JNIEnv*, jclass, jlong jopt_handle, jlong jcompactionfilter_handle) { reinterpret_cast(jopt_handle) ->compaction_filter = reinterpret_cast( @@ -242,8 +245,7 @@ void Java_org_rocksdb_Options_setCompactionFilterHandle( * Signature: (JJ)V */ void JNICALL Java_org_rocksdb_Options_setCompactionFilterFactoryHandle( - JNIEnv*, jobject, jlong jopt_handle, - jlong jcompactionfilterfactory_handle) { + JNIEnv*, jclass, jlong jopt_handle, jlong jcompactionfilterfactory_handle) { auto* cff_factory = reinterpret_cast< std::shared_ptr*>( jcompactionfilterfactory_handle); @@ -256,7 +258,7 @@ void JNICALL Java_org_rocksdb_Options_setCompactionFilterFactoryHandle( * Method: setWriteBufferSize * Signature: (JJ)I */ -void Java_org_rocksdb_Options_setWriteBufferSize(JNIEnv* env, jobject, +void Java_org_rocksdb_Options_setWriteBufferSize(JNIEnv* env, jclass, jlong jhandle, jlong jwrite_buffer_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( @@ -275,7 +277,7 @@ void Java_org_rocksdb_Options_setWriteBufferSize(JNIEnv* env, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWriteBufferManager( - JNIEnv*, jobject, jlong joptions_handle, + JNIEnv*, jclass, jlong joptions_handle, jlong jwrite_buffer_manager_handle) { auto* write_buffer_manager = reinterpret_cast*>( @@ -289,8 +291,7 @@ void Java_org_rocksdb_Options_setWriteBufferManager( * Method: writeBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writeBufferSize(JNIEnv*, jobject, - jlong jhandle) { +jlong Java_org_rocksdb_Options_writeBufferSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->write_buffer_size; } @@ -301,7 +302,7 @@ jlong Java_org_rocksdb_Options_writeBufferSize(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxWriteBufferNumber( - JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number) { + JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number) { reinterpret_cast(jhandle) ->max_write_buffer_number = jmax_write_buffer_number; } @@ -311,7 +312,7 @@ void Java_org_rocksdb_Options_setMaxWriteBufferNumber( * Method: setStatistics * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setStatistics(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setStatistics(JNIEnv*, jclass, jlong jhandle, jlong jstatistics_handle) { auto* opt = reinterpret_cast(jhandle); auto* pSptr = @@ -325,7 +326,7 @@ void Java_org_rocksdb_Options_setStatistics(JNIEnv*, jobject, jlong jhandle, * Method: statistics * Signature: (J)J */ -jlong Java_org_rocksdb_Options_statistics(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_statistics(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); std::shared_ptr sptr = opt->statistics; if (sptr == nullptr) { @@ -342,7 +343,7 @@ jlong Java_org_rocksdb_Options_statistics(JNIEnv*, jobject, jlong jhandle) { * Method: maxWriteBufferNumber * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxWriteBufferNumber(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxWriteBufferNumber(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number; @@ -353,7 +354,7 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumber(JNIEnv*, jobject, * Method: errorIfExists * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_errorIfExists(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_errorIfExists(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->error_if_exists; @@ -364,7 +365,7 @@ jboolean Java_org_rocksdb_Options_errorIfExists(JNIEnv*, jobject, * Method: setErrorIfExists * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setErrorIfExists(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setErrorIfExists(JNIEnv*, jclass, jlong jhandle, jboolean error_if_exists) { reinterpret_cast(jhandle)->error_if_exists = static_cast(error_if_exists); @@ -375,7 +376,7 @@ void Java_org_rocksdb_Options_setErrorIfExists(JNIEnv*, jobject, jlong jhandle, * Method: paranoidChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_paranoidChecks(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_paranoidChecks(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_checks; @@ -386,7 +387,7 @@ jboolean Java_org_rocksdb_Options_paranoidChecks(JNIEnv*, jobject, * Method: setParanoidChecks * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setParanoidChecks(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setParanoidChecks(JNIEnv*, jclass, jlong jhandle, jboolean paranoid_checks) { reinterpret_cast(jhandle)->paranoid_checks = static_cast(paranoid_checks); @@ -397,7 +398,7 @@ void Java_org_rocksdb_Options_setParanoidChecks(JNIEnv*, jobject, jlong jhandle, * Method: setEnv * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setEnv(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setEnv(JNIEnv*, jclass, jlong jhandle, jlong jenv) { reinterpret_cast(jhandle)->env = reinterpret_cast(jenv); @@ -408,8 +409,7 @@ void Java_org_rocksdb_Options_setEnv(JNIEnv*, jobject, jlong jhandle, * Method: setMaxTotalWalSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMaxTotalWalSize(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setMaxTotalWalSize(JNIEnv*, jclass, jlong jhandle, jlong jmax_total_wal_size) { reinterpret_cast(jhandle)->max_total_wal_size = static_cast(jmax_total_wal_size); @@ -420,8 +420,7 @@ void Java_org_rocksdb_Options_setMaxTotalWalSize(JNIEnv*, jobject, * Method: maxTotalWalSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxTotalWalSize(JNIEnv*, jobject, - jlong jhandle) { +jlong Java_org_rocksdb_Options_maxTotalWalSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_total_wal_size; } @@ -431,7 +430,7 @@ jlong Java_org_rocksdb_Options_maxTotalWalSize(JNIEnv*, jobject, * Method: maxOpenFiles * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxOpenFiles(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->max_open_files; } @@ -440,7 +439,7 @@ jint Java_org_rocksdb_Options_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) { * Method: setMaxOpenFiles * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setMaxOpenFiles(JNIEnv*, jclass, jlong jhandle, jint max_open_files) { reinterpret_cast(jhandle)->max_open_files = static_cast(max_open_files); @@ -452,7 +451,7 @@ void Java_org_rocksdb_Options_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxFileOpeningThreads( - JNIEnv*, jobject, jlong jhandle, jint jmax_file_opening_threads) { + JNIEnv*, jclass, jlong jhandle, jint jmax_file_opening_threads) { reinterpret_cast(jhandle) ->max_file_opening_threads = static_cast(jmax_file_opening_threads); } @@ -462,7 +461,7 @@ void Java_org_rocksdb_Options_setMaxFileOpeningThreads( * Method: maxFileOpeningThreads * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxFileOpeningThreads(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxFileOpeningThreads(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_file_opening_threads); @@ -473,7 +472,7 @@ jint Java_org_rocksdb_Options_maxFileOpeningThreads(JNIEnv*, jobject, * Method: useFsync * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useFsync(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_useFsync(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->use_fsync; } @@ -482,7 +481,7 @@ jboolean Java_org_rocksdb_Options_useFsync(JNIEnv*, jobject, jlong jhandle) { * Method: setUseFsync * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseFsync(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setUseFsync(JNIEnv*, jclass, jlong jhandle, jboolean use_fsync) { reinterpret_cast(jhandle)->use_fsync = static_cast(use_fsync); @@ -493,7 +492,7 @@ void Java_org_rocksdb_Options_setUseFsync(JNIEnv*, jobject, jlong jhandle, * Method: setDbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_Options_setDbPaths(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setDbPaths(JNIEnv* env, jclass, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { std::vector db_paths; @@ -539,7 +538,7 @@ void Java_org_rocksdb_Options_setDbPaths(JNIEnv* env, jobject, jlong jhandle, * Method: dbPathsLen * Signature: (J)J */ -jlong Java_org_rocksdb_Options_dbPathsLen(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_dbPathsLen(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_paths.size()); } @@ -549,7 +548,7 @@ jlong Java_org_rocksdb_Options_dbPathsLen(JNIEnv*, jobject, jlong jhandle) { * Method: dbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_Options_dbPaths(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_Options_dbPaths(JNIEnv* env, jclass, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { jboolean is_copy; @@ -590,7 +589,7 @@ void Java_org_rocksdb_Options_dbPaths(JNIEnv* env, jobject, jlong jhandle, * Method: dbLogDir * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_dbLogDir(JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Options_dbLogDir(JNIEnv* env, jclass, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle) ->db_log_dir.c_str()); @@ -601,7 +600,7 @@ jstring Java_org_rocksdb_Options_dbLogDir(JNIEnv* env, jobject, jlong jhandle) { * Method: setDbLogDir * Signature: (JLjava/lang/String)V */ -void Java_org_rocksdb_Options_setDbLogDir(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setDbLogDir(JNIEnv* env, jclass, jlong jhandle, jstring jdb_log_dir) { const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr); if (log_dir == nullptr) { @@ -618,7 +617,7 @@ void Java_org_rocksdb_Options_setDbLogDir(JNIEnv* env, jobject, jlong jhandle, * Method: walDir * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_walDir(JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Options_walDir(JNIEnv* env, jclass, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle)->wal_dir.c_str()); } @@ -628,7 +627,7 @@ jstring Java_org_rocksdb_Options_walDir(JNIEnv* env, jobject, jlong jhandle) { * Method: setWalDir * Signature: (JLjava/lang/String)V */ -void Java_org_rocksdb_Options_setWalDir(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setWalDir(JNIEnv* env, jclass, jlong jhandle, jstring jwal_dir) { const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr); if (wal_dir == nullptr) { @@ -645,7 +644,7 @@ void Java_org_rocksdb_Options_setWalDir(JNIEnv* env, jobject, jlong jhandle, * Method: deleteObsoleteFilesPeriodMicros * Signature: (J)J */ -jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->delete_obsolete_files_period_micros; @@ -657,7 +656,7 @@ jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(JNIEnv*, - jobject, + jclass, jlong jhandle, jlong micros) { reinterpret_cast(jhandle) @@ -669,7 +668,7 @@ void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(JNIEnv*, * Method: maxBackgroundCompactions * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxBackgroundCompactions(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxBackgroundCompactions(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_compactions; @@ -680,7 +679,7 @@ jint Java_org_rocksdb_Options_maxBackgroundCompactions(JNIEnv*, jobject, * Method: setMaxBackgroundCompactions * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxBackgroundCompactions(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setMaxBackgroundCompactions(JNIEnv*, jclass, jlong jhandle, jint max) { reinterpret_cast(jhandle) @@ -692,7 +691,7 @@ void Java_org_rocksdb_Options_setMaxBackgroundCompactions(JNIEnv*, jobject, * Method: setMaxSubcompactions * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxSubcompactions(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setMaxSubcompactions(JNIEnv*, jclass, jlong jhandle, jint max) { reinterpret_cast(jhandle)->max_subcompactions = static_cast(max); @@ -703,7 +702,7 @@ void Java_org_rocksdb_Options_setMaxSubcompactions(JNIEnv*, jobject, * Method: maxSubcompactions * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxSubcompactions(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxSubcompactions(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_subcompactions; @@ -714,7 +713,7 @@ jint Java_org_rocksdb_Options_maxSubcompactions(JNIEnv*, jobject, * Method: maxBackgroundFlushes * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxBackgroundFlushes(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxBackgroundFlushes(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_flushes; @@ -726,7 +725,7 @@ jint Java_org_rocksdb_Options_maxBackgroundFlushes(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxBackgroundFlushes( - JNIEnv*, jobject, jlong jhandle, jint max_background_flushes) { + JNIEnv*, jclass, jlong jhandle, jint max_background_flushes) { reinterpret_cast(jhandle) ->max_background_flushes = static_cast(max_background_flushes); } @@ -736,7 +735,7 @@ void Java_org_rocksdb_Options_setMaxBackgroundFlushes( * Method: maxBackgroundJobs * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxBackgroundJobs(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxBackgroundJobs(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_jobs; @@ -747,7 +746,7 @@ jint Java_org_rocksdb_Options_maxBackgroundJobs(JNIEnv*, jobject, * Method: setMaxBackgroundJobs * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxBackgroundJobs(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setMaxBackgroundJobs(JNIEnv*, jclass, jlong jhandle, jint max_background_jobs) { reinterpret_cast(jhandle)->max_background_jobs = @@ -759,7 +758,7 @@ void Java_org_rocksdb_Options_setMaxBackgroundJobs(JNIEnv*, jobject, * Method: maxLogFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxLogFileSize(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxLogFileSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_log_file_size; } @@ -769,7 +768,7 @@ jlong Java_org_rocksdb_Options_maxLogFileSize(JNIEnv*, jobject, jlong jhandle) { * Method: setMaxLogFileSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMaxLogFileSize(JNIEnv* env, jobject, +void Java_org_rocksdb_Options_setMaxLogFileSize(JNIEnv* env, jclass, jlong jhandle, jlong max_log_file_size) { auto s = @@ -787,7 +786,7 @@ void Java_org_rocksdb_Options_setMaxLogFileSize(JNIEnv* env, jobject, * Method: logFileTimeToRoll * Signature: (J)J */ -jlong Java_org_rocksdb_Options_logFileTimeToRoll(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_logFileTimeToRoll(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->log_file_time_to_roll; @@ -799,7 +798,7 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setLogFileTimeToRoll( - JNIEnv* env, jobject, jlong jhandle, jlong log_file_time_to_roll) { + JNIEnv* env, jclass, jlong jhandle, jlong log_file_time_to_roll) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( log_file_time_to_roll); if (s.ok()) { @@ -815,7 +814,7 @@ void Java_org_rocksdb_Options_setLogFileTimeToRoll( * Method: keepLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_Options_keepLogFileNum(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_keepLogFileNum(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->keep_log_file_num; } @@ -825,7 +824,7 @@ jlong Java_org_rocksdb_Options_keepLogFileNum(JNIEnv*, jobject, jlong jhandle) { * Method: setKeepLogFileNum * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setKeepLogFileNum(JNIEnv* env, jobject, +void Java_org_rocksdb_Options_setKeepLogFileNum(JNIEnv* env, jclass, jlong jhandle, jlong keep_log_file_num) { auto s = @@ -843,7 +842,7 @@ void Java_org_rocksdb_Options_setKeepLogFileNum(JNIEnv* env, jobject, * Method: recycleLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->recycle_log_file_num; @@ -854,7 +853,7 @@ jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv*, jobject, * Method: setRecycleLogFileNum * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setRecycleLogFileNum(JNIEnv* env, jobject, +void Java_org_rocksdb_Options_setRecycleLogFileNum(JNIEnv* env, jclass, jlong jhandle, jlong recycle_log_file_num) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( @@ -872,7 +871,7 @@ void Java_org_rocksdb_Options_setRecycleLogFileNum(JNIEnv* env, jobject, * Method: maxManifestFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxManifestFileSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_maxManifestFileSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_manifest_file_size; @@ -882,7 +881,7 @@ jlong Java_org_rocksdb_Options_maxManifestFileSize(JNIEnv*, jobject, * Method: memTableFactoryName * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_memTableFactoryName(JNIEnv* env, jobject, +jstring Java_org_rocksdb_Options_memTableFactoryName(JNIEnv* env, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::MemTableRepFactory* tf = opt->memtable_factory.get(); @@ -1034,7 +1033,7 @@ void Java_org_rocksdb_Options_cfPaths(JNIEnv* env, jclass, jlong jhandle, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxManifestFileSize( - JNIEnv*, jobject, jlong jhandle, jlong max_manifest_file_size) { + JNIEnv*, jclass, jlong jhandle, jlong max_manifest_file_size) { reinterpret_cast(jhandle) ->max_manifest_file_size = static_cast(max_manifest_file_size); } @@ -1043,8 +1042,7 @@ void Java_org_rocksdb_Options_setMaxManifestFileSize( * Method: setMemTableFactory * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMemTableFactory(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setMemTableFactory(JNIEnv*, jclass, jlong jhandle, jlong jfactory_handle) { reinterpret_cast(jhandle) ->memtable_factory.reset( @@ -1057,7 +1055,7 @@ void Java_org_rocksdb_Options_setMemTableFactory(JNIEnv*, jobject, * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setRateLimiter(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setRateLimiter(JNIEnv*, jclass, jlong jhandle, jlong jrate_limiter_handle) { std::shared_ptr* pRateLimiter = reinterpret_cast*>( @@ -1072,7 +1070,7 @@ void Java_org_rocksdb_Options_setRateLimiter(JNIEnv*, jobject, jlong jhandle, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setSstFileManager( - JNIEnv*, jobject, jlong jhandle, jlong jsst_file_manager_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jsst_file_manager_handle) { auto* sptr_sst_file_manager = reinterpret_cast*>( jsst_file_manager_handle); @@ -1083,14 +1081,31 @@ void Java_org_rocksdb_Options_setSstFileManager( /* * Class: org_rocksdb_Options * Method: setLogger - * Signature: (JJ)V + * Signature: (JJB)V */ -void Java_org_rocksdb_Options_setLogger(JNIEnv*, jobject, jlong jhandle, - jlong jlogger_handle) { - std::shared_ptr* pLogger = - reinterpret_cast*>( - jlogger_handle); - reinterpret_cast(jhandle)->info_log = *pLogger; +void Java_org_rocksdb_Options_setLogger(JNIEnv* env, jclass, jlong jhandle, + jlong jlogger_handle, + jbyte jlogger_type) { + auto* options = reinterpret_cast(jhandle); + switch (jlogger_type) { + case 0x1: + // JAVA_IMPLEMENTATION + options->info_log = + *(reinterpret_cast< + std::shared_ptr*>( + jlogger_handle)); + break; + case 0x2: + // STDERR_IMPLEMENTATION + options->info_log = + *(reinterpret_cast*>( + jlogger_handle)); + break; + default: + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, ROCKSDB_NAMESPACE::Status::InvalidArgument( + ROCKSDB_NAMESPACE::Slice("Unknown value for LoggerType"))); + } } /* @@ -1098,7 +1113,7 @@ void Java_org_rocksdb_Options_setLogger(JNIEnv*, jobject, jlong jhandle, * Method: setInfoLogLevel * Signature: (JB)V */ -void Java_org_rocksdb_Options_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setInfoLogLevel(JNIEnv*, jclass, jlong jhandle, jbyte jlog_level) { reinterpret_cast(jhandle)->info_log_level = static_cast(jlog_level); @@ -1109,7 +1124,7 @@ void Java_org_rocksdb_Options_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle, * Method: infoLogLevel * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_infoLogLevel(JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_infoLogLevel(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->info_log_level); } @@ -1119,7 +1134,7 @@ jbyte Java_org_rocksdb_Options_infoLogLevel(JNIEnv*, jobject, jlong jhandle) { * Method: tableCacheNumshardbits * Signature: (J)I */ -jint Java_org_rocksdb_Options_tableCacheNumshardbits(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_tableCacheNumshardbits(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->table_cache_numshardbits; @@ -1131,7 +1146,7 @@ jint Java_org_rocksdb_Options_tableCacheNumshardbits(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setTableCacheNumshardbits( - JNIEnv*, jobject, jlong jhandle, jint table_cache_numshardbits) { + JNIEnv*, jclass, jlong jhandle, jint table_cache_numshardbits) { reinterpret_cast(jhandle) ->table_cache_numshardbits = static_cast(table_cache_numshardbits); } @@ -1141,7 +1156,7 @@ void Java_org_rocksdb_Options_setTableCacheNumshardbits( * Signature: (JI)V */ void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( - JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { + JNIEnv*, jclass, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle) ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewFixedPrefixTransform( static_cast(jprefix_length))); @@ -1151,7 +1166,7 @@ void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( * Method: useCappedPrefixExtractor * Signature: (JI)V */ -void Java_org_rocksdb_Options_useCappedPrefixExtractor(JNIEnv*, jobject, +void Java_org_rocksdb_Options_useCappedPrefixExtractor(JNIEnv*, jclass, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle) @@ -1164,7 +1179,7 @@ void Java_org_rocksdb_Options_useCappedPrefixExtractor(JNIEnv*, jobject, * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_Options_walTtlSeconds(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_walTtlSeconds(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_ttl_seconds; } @@ -1174,7 +1189,7 @@ jlong Java_org_rocksdb_Options_walTtlSeconds(JNIEnv*, jobject, jlong jhandle) { * Method: setWalTtlSeconds * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalTtlSeconds(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setWalTtlSeconds(JNIEnv*, jclass, jlong jhandle, jlong WAL_ttl_seconds) { reinterpret_cast(jhandle)->WAL_ttl_seconds = static_cast(WAL_ttl_seconds); @@ -1185,7 +1200,7 @@ void Java_org_rocksdb_Options_setWalTtlSeconds(JNIEnv*, jobject, jlong jhandle, * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_Options_walSizeLimitMB(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_walSizeLimitMB(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_size_limit_MB; } @@ -1195,7 +1210,7 @@ jlong Java_org_rocksdb_Options_walSizeLimitMB(JNIEnv*, jobject, jlong jhandle) { * Method: setWalSizeLimitMB * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalSizeLimitMB(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setWalSizeLimitMB(JNIEnv*, jclass, jlong jhandle, jlong WAL_size_limit_MB) { reinterpret_cast(jhandle)->WAL_size_limit_MB = static_cast(WAL_size_limit_MB); @@ -1229,7 +1244,7 @@ jlong Java_org_rocksdb_Options_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, * Method: manifestPreallocationSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_manifestPreallocationSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_manifestPreallocationSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->manifest_preallocation_size; @@ -1241,7 +1256,7 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setManifestPreallocationSize( - JNIEnv* env, jobject, jlong jhandle, jlong preallocation_size) { + JNIEnv* env, jclass, jlong jhandle, jlong preallocation_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( preallocation_size); if (s.ok()) { @@ -1256,7 +1271,7 @@ void Java_org_rocksdb_Options_setManifestPreallocationSize( * Method: setTableFactory * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setTableFactory(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setTableFactory(JNIEnv*, jclass, jlong jhandle, jlong jtable_factory_handle) { auto* options = reinterpret_cast(jhandle); auto* table_factory = @@ -1268,7 +1283,7 @@ void Java_org_rocksdb_Options_setTableFactory(JNIEnv*, jobject, jlong jhandle, * Method: setSstPartitionerFactory * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setSstPartitionerFactory(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setSstPartitionerFactory(JNIEnv*, jclass, jlong jhandle, jlong factory_handle) { auto* options = reinterpret_cast(jhandle); @@ -1297,7 +1312,7 @@ void Java_org_rocksdb_Options_setCompactionThreadLimiter( * Method: allowMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowMmapReads(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_allowMmapReads(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_reads; @@ -1308,7 +1323,7 @@ jboolean Java_org_rocksdb_Options_allowMmapReads(JNIEnv*, jobject, * Method: setAllowMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowMmapReads(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setAllowMmapReads(JNIEnv*, jclass, jlong jhandle, jboolean allow_mmap_reads) { reinterpret_cast(jhandle)->allow_mmap_reads = static_cast(allow_mmap_reads); @@ -1319,7 +1334,7 @@ void Java_org_rocksdb_Options_setAllowMmapReads(JNIEnv*, jobject, jlong jhandle, * Method: allowMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowMmapWrites(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_allowMmapWrites(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_writes; @@ -1330,8 +1345,7 @@ jboolean Java_org_rocksdb_Options_allowMmapWrites(JNIEnv*, jobject, * Method: setAllowMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowMmapWrites(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setAllowMmapWrites(JNIEnv*, jclass, jlong jhandle, jboolean allow_mmap_writes) { reinterpret_cast(jhandle)->allow_mmap_writes = static_cast(allow_mmap_writes); @@ -1342,7 +1356,7 @@ void Java_org_rocksdb_Options_setAllowMmapWrites(JNIEnv*, jobject, * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_reads; @@ -1353,7 +1367,7 @@ jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv*, jobject, * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv*, jclass, jlong jhandle, jboolean use_direct_reads) { reinterpret_cast(jhandle)->use_direct_reads = static_cast(use_direct_reads); @@ -1365,7 +1379,7 @@ void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv*, jobject, jlong jhandle, * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_useDirectIoForFlushAndCompaction( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_io_for_flush_and_compaction; } @@ -1376,7 +1390,7 @@ jboolean Java_org_rocksdb_Options_useDirectIoForFlushAndCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setUseDirectIoForFlushAndCompaction( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean use_direct_io_for_flush_and_compaction) { reinterpret_cast(jhandle) ->use_direct_io_for_flush_and_compaction = @@ -1388,7 +1402,7 @@ void Java_org_rocksdb_Options_setUseDirectIoForFlushAndCompaction( * Method: setAllowFAllocate * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowFAllocate(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setAllowFAllocate(JNIEnv*, jclass, jlong jhandle, jboolean jallow_fallocate) { reinterpret_cast(jhandle)->allow_fallocate = static_cast(jallow_fallocate); @@ -1399,7 +1413,7 @@ void Java_org_rocksdb_Options_setAllowFAllocate(JNIEnv*, jobject, jlong jhandle, * Method: allowFAllocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowFAllocate(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_allowFAllocate(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_fallocate); @@ -1410,7 +1424,7 @@ jboolean Java_org_rocksdb_Options_allowFAllocate(JNIEnv*, jobject, * Method: isFdCloseOnExec * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_isFdCloseOnExec(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_isFdCloseOnExec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->is_fd_close_on_exec; @@ -1421,8 +1435,7 @@ jboolean Java_org_rocksdb_Options_isFdCloseOnExec(JNIEnv*, jobject, * Method: setIsFdCloseOnExec * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setIsFdCloseOnExec(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setIsFdCloseOnExec(JNIEnv*, jclass, jlong jhandle, jboolean is_fd_close_on_exec) { reinterpret_cast(jhandle)->is_fd_close_on_exec = static_cast(is_fd_close_on_exec); @@ -1433,7 +1446,7 @@ void Java_org_rocksdb_Options_setIsFdCloseOnExec(JNIEnv*, jobject, * Method: statsDumpPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_Options_statsDumpPeriodSec(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_statsDumpPeriodSec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->stats_dump_period_sec; @@ -1445,7 +1458,7 @@ jint Java_org_rocksdb_Options_statsDumpPeriodSec(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setStatsDumpPeriodSec( - JNIEnv*, jobject, jlong jhandle, jint jstats_dump_period_sec) { + JNIEnv*, jclass, jlong jhandle, jint jstats_dump_period_sec) { reinterpret_cast(jhandle) ->stats_dump_period_sec = static_cast(jstats_dump_period_sec); @@ -1456,7 +1469,7 @@ void Java_org_rocksdb_Options_setStatsDumpPeriodSec( * Method: statsPersistPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_Options_statsPersistPeriodSec(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_statsPersistPeriodSec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->stats_persist_period_sec; @@ -1468,7 +1481,7 @@ jint Java_org_rocksdb_Options_statsPersistPeriodSec(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setStatsPersistPeriodSec( - JNIEnv*, jobject, jlong jhandle, jint jstats_persist_period_sec) { + JNIEnv*, jclass, jlong jhandle, jint jstats_persist_period_sec) { reinterpret_cast(jhandle) ->stats_persist_period_sec = static_cast(jstats_persist_period_sec); @@ -1479,7 +1492,7 @@ void Java_org_rocksdb_Options_setStatsPersistPeriodSec( * Method: statsHistoryBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_statsHistoryBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_statsHistoryBufferSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->stats_history_buffer_size; @@ -1491,7 +1504,7 @@ jlong Java_org_rocksdb_Options_statsHistoryBufferSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setStatsHistoryBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jstats_history_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jstats_history_buffer_size) { reinterpret_cast(jhandle) ->stats_history_buffer_size = static_cast(jstats_history_buffer_size); @@ -1502,7 +1515,7 @@ void Java_org_rocksdb_Options_setStatsHistoryBufferSize( * Method: adviseRandomOnOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->advise_random_on_open; @@ -1514,7 +1527,7 @@ jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAdviseRandomOnOpen( - JNIEnv*, jobject, jlong jhandle, jboolean advise_random_on_open) { + JNIEnv*, jclass, jlong jhandle, jboolean advise_random_on_open) { reinterpret_cast(jhandle) ->advise_random_on_open = static_cast(advise_random_on_open); } @@ -1525,7 +1538,7 @@ void Java_org_rocksdb_Options_setAdviseRandomOnOpen( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setDbWriteBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jdb_write_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jdb_write_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->db_write_buffer_size = static_cast(jdb_write_buffer_size); } @@ -1535,43 +1548,19 @@ void Java_org_rocksdb_Options_setDbWriteBufferSize( * Method: dbWriteBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_dbWriteBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_dbWriteBufferSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_write_buffer_size); } -/* - * Class: org_rocksdb_Options - * Method: setAccessHintOnCompactionStart - * Signature: (JB)V - */ -void Java_org_rocksdb_Options_setAccessHintOnCompactionStart( - JNIEnv*, jobject, jlong jhandle, jbyte jaccess_hint_value) { - auto* opt = reinterpret_cast(jhandle); - opt->access_hint_on_compaction_start = - ROCKSDB_NAMESPACE::AccessHintJni::toCppAccessHint(jaccess_hint_value); -} - -/* - * Class: org_rocksdb_Options - * Method: accessHintOnCompactionStart - * Signature: (J)B - */ -jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart(JNIEnv*, jobject, - jlong jhandle) { - auto* opt = reinterpret_cast(jhandle); - return ROCKSDB_NAMESPACE::AccessHintJni::toJavaAccessHint( - opt->access_hint_on_compaction_start); -} - /* * Class: org_rocksdb_Options * Method: setCompactionReadaheadSize * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle, jlong jcompaction_readahead_size) { + JNIEnv*, jclass, jlong jhandle, jlong jcompaction_readahead_size) { auto* opt = reinterpret_cast(jhandle); opt->compaction_readahead_size = static_cast(jcompaction_readahead_size); @@ -1582,7 +1571,7 @@ void Java_org_rocksdb_Options_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_compactionReadaheadSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_compactionReadaheadSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->compaction_readahead_size); @@ -1594,7 +1583,7 @@ jlong Java_org_rocksdb_Options_compactionReadaheadSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setRandomAccessMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jrandom_access_max_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jrandom_access_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->random_access_max_buffer_size = static_cast(jrandom_access_max_buffer_size); @@ -1605,7 +1594,7 @@ void Java_org_rocksdb_Options_setRandomAccessMaxBufferSize( * Method: randomAccessMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->random_access_max_buffer_size); @@ -1617,7 +1606,7 @@ jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWritableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jwritable_file_max_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jwritable_file_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->writable_file_max_buffer_size = static_cast(jwritable_file_max_buffer_size); @@ -1628,7 +1617,7 @@ void Java_org_rocksdb_Options_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writableFileMaxBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_writableFileMaxBufferSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->writable_file_max_buffer_size); @@ -1639,7 +1628,7 @@ jlong Java_org_rocksdb_Options_writableFileMaxBufferSize(JNIEnv*, jobject, * Method: useAdaptiveMutex * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useAdaptiveMutex(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_useAdaptiveMutex(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->use_adaptive_mutex; @@ -1650,7 +1639,7 @@ jboolean Java_org_rocksdb_Options_useAdaptiveMutex(JNIEnv*, jobject, * Method: setUseAdaptiveMutex * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseAdaptiveMutex(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setUseAdaptiveMutex(JNIEnv*, jclass, jlong jhandle, jboolean use_adaptive_mutex) { reinterpret_cast(jhandle)->use_adaptive_mutex = @@ -1662,7 +1651,7 @@ void Java_org_rocksdb_Options_setUseAdaptiveMutex(JNIEnv*, jobject, * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_Options_bytesPerSync(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_bytesPerSync(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->bytes_per_sync; } @@ -1671,7 +1660,7 @@ jlong Java_org_rocksdb_Options_bytesPerSync(JNIEnv*, jobject, jlong jhandle) { * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setBytesPerSync(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setBytesPerSync(JNIEnv*, jclass, jlong jhandle, jlong bytes_per_sync) { reinterpret_cast(jhandle)->bytes_per_sync = static_cast(bytes_per_sync); @@ -1682,8 +1671,7 @@ void Java_org_rocksdb_Options_setBytesPerSync(JNIEnv*, jobject, jlong jhandle, * Method: setWalBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalBytesPerSync(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setWalBytesPerSync(JNIEnv*, jclass, jlong jhandle, jlong jwal_bytes_per_sync) { reinterpret_cast(jhandle)->wal_bytes_per_sync = static_cast(jwal_bytes_per_sync); @@ -1694,8 +1682,7 @@ void Java_org_rocksdb_Options_setWalBytesPerSync(JNIEnv*, jobject, * Method: walBytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_Options_walBytesPerSync(JNIEnv*, jobject, - jlong jhandle) { +jlong Java_org_rocksdb_Options_walBytesPerSync(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->wal_bytes_per_sync); } @@ -1706,7 +1693,7 @@ jlong Java_org_rocksdb_Options_walBytesPerSync(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setStrictBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jboolean jstrict_bytes_per_sync) { + JNIEnv*, jclass, jlong jhandle, jboolean jstrict_bytes_per_sync) { reinterpret_cast(jhandle) ->strict_bytes_per_sync = jstrict_bytes_per_sync == JNI_TRUE; } @@ -1716,7 +1703,7 @@ void Java_org_rocksdb_Options_setStrictBytesPerSync( * Method: strictBytesPerSync * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_strictBytesPerSync(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_strictBytesPerSync(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->strict_bytes_per_sync); @@ -1798,7 +1785,7 @@ jobjectArray Java_org_rocksdb_Options_eventListeners(JNIEnv* env, jclass, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setEnableThreadTracking( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_thread_tracking) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_thread_tracking) { auto* opt = reinterpret_cast(jhandle); opt->enable_thread_tracking = static_cast(jenable_thread_tracking); } @@ -1808,7 +1795,7 @@ void Java_org_rocksdb_Options_setEnableThreadTracking( * Method: enableThreadTracking * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_enableThreadTracking(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_enableThreadTracking(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_thread_tracking); @@ -1819,7 +1806,7 @@ jboolean Java_org_rocksdb_Options_enableThreadTracking(JNIEnv*, jobject, * Method: setDelayedWriteRate * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setDelayedWriteRate(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setDelayedWriteRate(JNIEnv*, jclass, jlong jhandle, jlong jdelayed_write_rate) { auto* opt = reinterpret_cast(jhandle); @@ -1831,7 +1818,7 @@ void Java_org_rocksdb_Options_setDelayedWriteRate(JNIEnv*, jobject, * Method: delayedWriteRate * Signature: (J)J */ -jlong Java_org_rocksdb_Options_delayedWriteRate(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_delayedWriteRate(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->delayed_write_rate); @@ -1843,7 +1830,7 @@ jlong Java_org_rocksdb_Options_delayedWriteRate(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setEnablePipelinedWrite( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_pipelined_write) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_pipelined_write) { auto* opt = reinterpret_cast(jhandle); opt->enable_pipelined_write = jenable_pipelined_write == JNI_TRUE; } @@ -1853,7 +1840,7 @@ void Java_org_rocksdb_Options_setEnablePipelinedWrite( * Method: enablePipelinedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_enablePipelinedWrite(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_enablePipelinedWrite(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_pipelined_write); @@ -1864,7 +1851,7 @@ jboolean Java_org_rocksdb_Options_enablePipelinedWrite(JNIEnv*, jobject, * Method: setUnorderedWrite * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUnorderedWrite(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setUnorderedWrite(JNIEnv*, jclass, jlong jhandle, jboolean unordered_write) { reinterpret_cast(jhandle)->unordered_write = static_cast(unordered_write); @@ -1875,7 +1862,7 @@ void Java_org_rocksdb_Options_setUnorderedWrite(JNIEnv*, jobject, jlong jhandle, * Method: unorderedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_unorderedWrite(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_unorderedWrite(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->unordered_write; @@ -1886,7 +1873,7 @@ jboolean Java_org_rocksdb_Options_unorderedWrite(JNIEnv*, jobject, * Method: setAllowConcurrentMemtableWrite * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite(JNIEnv*, jclass, jlong jhandle, jboolean allow) { reinterpret_cast(jhandle) @@ -1898,7 +1885,7 @@ void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite(JNIEnv*, jobject, * Method: allowConcurrentMemtableWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->allow_concurrent_memtable_write; @@ -1910,7 +1897,7 @@ jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setEnableWriteThreadAdaptiveYield( - JNIEnv*, jobject, jlong jhandle, jboolean yield) { + JNIEnv*, jclass, jlong jhandle, jboolean yield) { reinterpret_cast(jhandle) ->enable_write_thread_adaptive_yield = static_cast(yield); } @@ -1921,7 +1908,7 @@ void Java_org_rocksdb_Options_setEnableWriteThreadAdaptiveYield( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_enableWriteThreadAdaptiveYield( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->enable_write_thread_adaptive_yield; } @@ -1931,7 +1918,7 @@ jboolean Java_org_rocksdb_Options_enableWriteThreadAdaptiveYield( * Method: setWriteThreadMaxYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec(JNIEnv*, jclass, jlong jhandle, jlong max) { reinterpret_cast(jhandle) @@ -1943,7 +1930,7 @@ void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec(JNIEnv*, jobject, * Method: writeThreadMaxYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_max_yield_usec; @@ -1954,7 +1941,7 @@ jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec(JNIEnv*, jobject, * Method: setWriteThreadSlowYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec(JNIEnv*, jobject, +void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec(JNIEnv*, jclass, jlong jhandle, jlong slow) { reinterpret_cast(jhandle) @@ -1966,7 +1953,7 @@ void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec(JNIEnv*, jobject, * Method: writeThreadSlowYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_slow_yield_usec; @@ -1978,7 +1965,7 @@ jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen( - JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) { + JNIEnv*, jclass, jlong jhandle, jboolean jskip_stats_update_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_stats_update_on_db_open = static_cast(jskip_stats_update_on_db_open); @@ -1989,7 +1976,7 @@ void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen( * Method: skipStatsUpdateOnDbOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_stats_update_on_db_open); @@ -2025,7 +2012,7 @@ jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( * Signature: (JB)V */ void Java_org_rocksdb_Options_setWalRecoveryMode( - JNIEnv*, jobject, jlong jhandle, jbyte jwal_recovery_mode_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jwal_recovery_mode_value) { auto* opt = reinterpret_cast(jhandle); opt->wal_recovery_mode = ROCKSDB_NAMESPACE::WALRecoveryModeJni::toCppWALRecoveryMode( @@ -2037,8 +2024,7 @@ void Java_org_rocksdb_Options_setWalRecoveryMode( * Method: walRecoveryMode * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_walRecoveryMode(JNIEnv*, jobject, - jlong jhandle) { +jbyte Java_org_rocksdb_Options_walRecoveryMode(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::WALRecoveryModeJni::toJavaWALRecoveryMode( opt->wal_recovery_mode); @@ -2049,7 +2035,7 @@ jbyte Java_org_rocksdb_Options_walRecoveryMode(JNIEnv*, jobject, * Method: setAllow2pc * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllow2pc(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setAllow2pc(JNIEnv*, jclass, jlong jhandle, jboolean jallow_2pc) { auto* opt = reinterpret_cast(jhandle); opt->allow_2pc = static_cast(jallow_2pc); @@ -2060,7 +2046,7 @@ void Java_org_rocksdb_Options_setAllow2pc(JNIEnv*, jobject, jlong jhandle, * Method: allow2pc * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_2pc); } @@ -2070,7 +2056,7 @@ jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv*, jobject, jlong jhandle) { * Method: setRowCache * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setRowCache(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setRowCache(JNIEnv*, jclass, jlong jhandle, jlong jrow_cache_handle) { auto* opt = reinterpret_cast(jhandle); auto* row_cache = @@ -2084,7 +2070,7 @@ void Java_org_rocksdb_Options_setRowCache(JNIEnv*, jobject, jlong jhandle, * Method: setWalFilter * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jclass, jlong jhandle, jlong jwal_filter_handle) { auto* opt = reinterpret_cast(jhandle); auto* wal_filter = reinterpret_cast( @@ -2098,7 +2084,7 @@ void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jobject, jlong jhandle, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setFailIfOptionsFileError( - JNIEnv*, jobject, jlong jhandle, jboolean jfail_if_options_file_error) { + JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) { auto* opt = reinterpret_cast(jhandle); opt->fail_if_options_file_error = static_cast(jfail_if_options_file_error); @@ -2109,7 +2095,7 @@ void Java_org_rocksdb_Options_setFailIfOptionsFileError( * Method: failIfOptionsFileError * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->fail_if_options_file_error); @@ -2120,8 +2106,7 @@ jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jobject, * Method: setDumpMallocStats * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setDumpMallocStats(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setDumpMallocStats(JNIEnv*, jclass, jlong jhandle, jboolean jdump_malloc_stats) { auto* opt = reinterpret_cast(jhandle); opt->dump_malloc_stats = static_cast(jdump_malloc_stats); @@ -2132,7 +2117,7 @@ void Java_org_rocksdb_Options_setDumpMallocStats(JNIEnv*, jobject, * Method: dumpMallocStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_dumpMallocStats(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_dumpMallocStats(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->dump_malloc_stats); @@ -2144,7 +2129,7 @@ jboolean Java_org_rocksdb_Options_dumpMallocStats(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAvoidFlushDuringRecovery( - JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_recovery) { + JNIEnv*, jclass, jlong jhandle, jboolean javoid_flush_during_recovery) { auto* opt = reinterpret_cast(jhandle); opt->avoid_flush_during_recovery = static_cast(javoid_flush_during_recovery); @@ -2155,7 +2140,7 @@ void Java_org_rocksdb_Options_setAvoidFlushDuringRecovery( * Method: avoidFlushDuringRecovery * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_recovery); @@ -2323,7 +2308,7 @@ jlong Java_org_rocksdb_Options_bgerrorResumeRetryInterval(JNIEnv*, jclass, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAvoidFlushDuringShutdown( - JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_shutdown) { + JNIEnv*, jclass, jlong jhandle, jboolean javoid_flush_during_shutdown) { auto* opt = reinterpret_cast(jhandle); opt->avoid_flush_during_shutdown = static_cast(javoid_flush_during_shutdown); @@ -2334,7 +2319,7 @@ void Java_org_rocksdb_Options_setAvoidFlushDuringShutdown( * Method: avoidFlushDuringShutdown * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_shutdown); @@ -2346,7 +2331,7 @@ jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAllowIngestBehind( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_ingest_behind) { + JNIEnv*, jclass, jlong jhandle, jboolean jallow_ingest_behind) { auto* opt = reinterpret_cast(jhandle); opt->allow_ingest_behind = jallow_ingest_behind == JNI_TRUE; } @@ -2356,7 +2341,7 @@ void Java_org_rocksdb_Options_setAllowIngestBehind( * Method: allowIngestBehind * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowIngestBehind(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_allowIngestBehind(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_ingest_behind); @@ -2367,7 +2352,7 @@ jboolean Java_org_rocksdb_Options_allowIngestBehind(JNIEnv*, jobject, * Method: setTwoWriteQueues * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setTwoWriteQueues(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setTwoWriteQueues(JNIEnv*, jclass, jlong jhandle, jboolean jtwo_write_queues) { auto* opt = reinterpret_cast(jhandle); opt->two_write_queues = jtwo_write_queues == JNI_TRUE; @@ -2378,7 +2363,7 @@ void Java_org_rocksdb_Options_setTwoWriteQueues(JNIEnv*, jobject, jlong jhandle, * Method: twoWriteQueues * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_twoWriteQueues(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_twoWriteQueues(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->two_write_queues); @@ -2389,7 +2374,7 @@ jboolean Java_org_rocksdb_Options_twoWriteQueues(JNIEnv*, jobject, * Method: setManualWalFlush * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setManualWalFlush(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setManualWalFlush(JNIEnv*, jclass, jlong jhandle, jboolean jmanual_wal_flush) { auto* opt = reinterpret_cast(jhandle); opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE; @@ -2400,7 +2385,7 @@ void Java_org_rocksdb_Options_setManualWalFlush(JNIEnv*, jobject, jlong jhandle, * Method: manualWalFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_manualWalFlush(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_manualWalFlush(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->manual_wal_flush); @@ -2411,7 +2396,7 @@ jboolean Java_org_rocksdb_Options_manualWalFlush(JNIEnv*, jobject, * Method: setAtomicFlush * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAtomicFlush(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setAtomicFlush(JNIEnv*, jclass, jlong jhandle, jboolean jatomic_flush) { auto* opt = reinterpret_cast(jhandle); opt->atomic_flush = jatomic_flush == JNI_TRUE; @@ -2422,7 +2407,7 @@ void Java_org_rocksdb_Options_setAtomicFlush(JNIEnv*, jobject, jlong jhandle, * Method: atomicFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_atomicFlush(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_atomicFlush(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->atomic_flush); } @@ -2431,7 +2416,7 @@ jboolean Java_org_rocksdb_Options_atomicFlush(JNIEnv*, jobject, jlong jhandle) { * Method: tableFactoryName * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_tableFactoryName(JNIEnv* env, jobject, +jstring Java_org_rocksdb_Options_tableFactoryName(JNIEnv* env, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::TableFactory* tf = opt->table_factory.get(); @@ -2448,7 +2433,7 @@ jstring Java_org_rocksdb_Options_tableFactoryName(JNIEnv* env, jobject, * Method: minWriteBufferNumberToMerge * Signature: (J)I */ -jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->min_write_buffer_number_to_merge; @@ -2460,7 +2445,7 @@ jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( - JNIEnv*, jobject, jlong jhandle, jint jmin_write_buffer_number_to_merge) { + JNIEnv*, jclass, jlong jhandle, jint jmin_write_buffer_number_to_merge) { reinterpret_cast(jhandle) ->min_write_buffer_number_to_merge = static_cast(jmin_write_buffer_number_to_merge); @@ -2470,7 +2455,7 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( * Method: maxWriteBufferNumberToMaintain * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain; @@ -2482,8 +2467,7 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain( - JNIEnv*, jobject, jlong jhandle, - jint jmax_write_buffer_number_to_maintain) { + JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) { reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain = static_cast(jmax_write_buffer_number_to_maintain); @@ -2495,7 +2479,7 @@ void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain( * Signature: (JB)V */ void Java_org_rocksdb_Options_setCompressionType( - JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompression_type_value) { auto* opts = reinterpret_cast(jhandle); opts->compression = ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( @@ -2507,8 +2491,7 @@ void Java_org_rocksdb_Options_setCompressionType( * Method: compressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_compressionType(JNIEnv*, jobject, - jlong jhandle) { +jbyte Java_org_rocksdb_Options_compressionType(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( opts->compression); @@ -2599,7 +2582,7 @@ jbyteArray rocksdb_compression_list_helper( * Signature: (J[B)V */ void Java_org_rocksdb_Options_setCompressionPerLevel( - JNIEnv* env, jobject, jlong jhandle, jbyteArray jcompressionLevels) { + JNIEnv* env, jclass, jlong jhandle, jbyteArray jcompressionLevels) { auto uptr_compression_levels = rocksdb_compression_vector_helper(env, jcompressionLevels); if (!uptr_compression_levels) { @@ -2615,7 +2598,7 @@ void Java_org_rocksdb_Options_setCompressionPerLevel( * Method: compressionPerLevel * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_Options_compressionPerLevel(JNIEnv* env, jobject, +jbyteArray Java_org_rocksdb_Options_compressionPerLevel(JNIEnv* env, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return rocksdb_compression_list_helper(env, options->compression_per_level); @@ -2627,7 +2610,7 @@ jbyteArray Java_org_rocksdb_Options_compressionPerLevel(JNIEnv* env, jobject, * Signature: (JB)V */ void Java_org_rocksdb_Options_setBottommostCompressionType( - JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompression_type_value) { auto* options = reinterpret_cast(jhandle); options->bottommost_compression = ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( @@ -2639,7 +2622,7 @@ void Java_org_rocksdb_Options_setBottommostCompressionType( * Method: bottommostCompressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_bottommostCompressionType(JNIEnv*, jobject, +jbyte Java_org_rocksdb_Options_bottommostCompressionType(JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( @@ -2652,7 +2635,7 @@ jbyte Java_org_rocksdb_Options_bottommostCompressionType(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setBottommostCompressionOptions( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jbottommost_compression_options_handle) { auto* options = reinterpret_cast(jhandle); auto* bottommost_compression_options = @@ -2667,7 +2650,7 @@ void Java_org_rocksdb_Options_setBottommostCompressionOptions( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompressionOptions( - JNIEnv*, jobject, jlong jhandle, jlong jcompression_options_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jcompression_options_handle) { auto* options = reinterpret_cast(jhandle); auto* compression_options = reinterpret_cast( @@ -2680,8 +2663,7 @@ void Java_org_rocksdb_Options_setCompressionOptions( * Method: setCompactionStyle * Signature: (JB)V */ -void Java_org_rocksdb_Options_setCompactionStyle(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setCompactionStyle(JNIEnv*, jclass, jlong jhandle, jbyte jcompaction_style) { auto* options = reinterpret_cast(jhandle); options->compaction_style = @@ -2694,8 +2676,7 @@ void Java_org_rocksdb_Options_setCompactionStyle(JNIEnv*, jobject, * Method: compactionStyle * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_compactionStyle(JNIEnv*, jobject, - jlong jhandle) { +jbyte Java_org_rocksdb_Options_compactionStyle(JNIEnv*, jclass, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompactionStyleJni::toJavaCompactionStyle( options->compaction_style); @@ -2707,7 +2688,7 @@ jbyte Java_org_rocksdb_Options_compactionStyle(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( - JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_table_files_size) { reinterpret_cast(jhandle) ->compaction_options_fifo.max_table_files_size = static_cast(jmax_table_files_size); @@ -2718,7 +2699,7 @@ void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( * Method: maxTableFilesSizeFIFO * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->compaction_options_fifo.max_table_files_size; @@ -2729,7 +2710,7 @@ jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO(JNIEnv*, jobject, * Method: numLevels * Signature: (J)I */ -jint Java_org_rocksdb_Options_numLevels(JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_numLevels(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->num_levels; } @@ -2738,7 +2719,7 @@ jint Java_org_rocksdb_Options_numLevels(JNIEnv*, jobject, jlong jhandle) { * Method: setNumLevels * Signature: (JI)V */ -void Java_org_rocksdb_Options_setNumLevels(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setNumLevels(JNIEnv*, jclass, jlong jhandle, jint jnum_levels) { reinterpret_cast(jhandle)->num_levels = static_cast(jnum_levels); @@ -2749,8 +2730,7 @@ void Java_org_rocksdb_Options_setNumLevels(JNIEnv*, jobject, jlong jhandle, * Method: levelZeroFileNumCompactionTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(JNIEnv*, - jobject, +jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger; @@ -2762,7 +2742,7 @@ jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(JNIEnv*, * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger = static_cast(jlevel0_file_num_compaction_trigger); @@ -2773,7 +2753,7 @@ void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( * Method: levelZeroSlowdownWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger; @@ -2785,7 +2765,7 @@ jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger = static_cast(jlevel0_slowdown_writes_trigger); @@ -2796,7 +2776,7 @@ void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger( * Method: levelZeroStopWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_stop_writes_trigger; @@ -2808,7 +2788,7 @@ jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_stop_writes_trigger) { reinterpret_cast(jhandle) ->level0_stop_writes_trigger = static_cast(jlevel0_stop_writes_trigger); @@ -2819,7 +2799,7 @@ void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger( * Method: targetFileSizeBase * Signature: (J)J */ -jlong Java_org_rocksdb_Options_targetFileSizeBase(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_targetFileSizeBase(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->target_file_size_base; @@ -2831,7 +2811,7 @@ jlong Java_org_rocksdb_Options_targetFileSizeBase(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setTargetFileSizeBase( - JNIEnv*, jobject, jlong jhandle, jlong jtarget_file_size_base) { + JNIEnv*, jclass, jlong jhandle, jlong jtarget_file_size_base) { reinterpret_cast(jhandle) ->target_file_size_base = static_cast(jtarget_file_size_base); } @@ -2841,7 +2821,7 @@ void Java_org_rocksdb_Options_setTargetFileSizeBase( * Method: targetFileSizeMultiplier * Signature: (J)I */ -jint Java_org_rocksdb_Options_targetFileSizeMultiplier(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_targetFileSizeMultiplier(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->target_file_size_multiplier; @@ -2853,7 +2833,7 @@ jint Java_org_rocksdb_Options_targetFileSizeMultiplier(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setTargetFileSizeMultiplier( - JNIEnv*, jobject, jlong jhandle, jint jtarget_file_size_multiplier) { + JNIEnv*, jclass, jlong jhandle, jint jtarget_file_size_multiplier) { reinterpret_cast(jhandle) ->target_file_size_multiplier = static_cast(jtarget_file_size_multiplier); @@ -2864,7 +2844,7 @@ void Java_org_rocksdb_Options_setTargetFileSizeMultiplier( * Method: maxBytesForLevelBase * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxBytesForLevelBase(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_maxBytesForLevelBase(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_bytes_for_level_base; @@ -2876,7 +2856,7 @@ jlong Java_org_rocksdb_Options_maxBytesForLevelBase(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxBytesForLevelBase( - JNIEnv*, jobject, jlong jhandle, jlong jmax_bytes_for_level_base) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_bytes_for_level_base) { reinterpret_cast(jhandle) ->max_bytes_for_level_base = static_cast(jmax_bytes_for_level_base); @@ -2888,7 +2868,7 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelBase( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_levelCompactionDynamicLevelBytes( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level_compaction_dynamic_level_bytes; } @@ -2899,7 +2879,7 @@ jboolean Java_org_rocksdb_Options_levelCompactionDynamicLevelBytes( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_dynamic_level_bytes) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_dynamic_level_bytes) { reinterpret_cast(jhandle) ->level_compaction_dynamic_level_bytes = (jenable_dynamic_level_bytes); } @@ -2909,7 +2889,7 @@ void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes( * Method: maxBytesForLevelMultiplier * Signature: (J)D */ -jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv*, jobject, +jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier; @@ -2921,7 +2901,7 @@ jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv*, jobject, * Signature: (JD)V */ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( - JNIEnv*, jobject, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) { + JNIEnv*, jclass, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) { reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier = static_cast(jmax_bytes_for_level_multiplier); @@ -2932,7 +2912,7 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( * Method: maxCompactionBytes * Signature: (J)I */ -jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast(jhandle) @@ -2945,7 +2925,7 @@ jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxCompactionBytes( - JNIEnv*, jobject, jlong jhandle, jlong jmax_compaction_bytes) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_compaction_bytes) { reinterpret_cast(jhandle)->max_compaction_bytes = static_cast(jmax_compaction_bytes); } @@ -2955,7 +2935,7 @@ void Java_org_rocksdb_Options_setMaxCompactionBytes( * Method: arenaBlockSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_arenaBlockSize(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_arenaBlockSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->arena_block_size; } @@ -2965,7 +2945,7 @@ jlong Java_org_rocksdb_Options_arenaBlockSize(JNIEnv*, jobject, jlong jhandle) { * Method: setArenaBlockSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setArenaBlockSize(JNIEnv* env, jobject, +void Java_org_rocksdb_Options_setArenaBlockSize(JNIEnv* env, jclass, jlong jhandle, jlong jarena_block_size) { auto s = @@ -2983,7 +2963,7 @@ void Java_org_rocksdb_Options_setArenaBlockSize(JNIEnv* env, jobject, * Method: disableAutoCompactions * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_disableAutoCompactions(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_disableAutoCompactions(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->disable_auto_compactions; @@ -2995,7 +2975,7 @@ jboolean Java_org_rocksdb_Options_disableAutoCompactions(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setDisableAutoCompactions( - JNIEnv*, jobject, jlong jhandle, jboolean jdisable_auto_compactions) { + JNIEnv*, jclass, jlong jhandle, jboolean jdisable_auto_compactions) { reinterpret_cast(jhandle) ->disable_auto_compactions = static_cast(jdisable_auto_compactions); } @@ -3005,7 +2985,7 @@ void Java_org_rocksdb_Options_setDisableAutoCompactions( * Method: maxSequentialSkipInIterations * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_sequential_skip_in_iterations; @@ -3017,7 +2997,7 @@ jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( - JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { reinterpret_cast(jhandle) ->max_sequential_skip_in_iterations = static_cast(jmax_sequential_skip_in_iterations); @@ -3028,7 +3008,7 @@ void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( * Method: inplaceUpdateSupport * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->inplace_update_support; @@ -3040,7 +3020,7 @@ jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setInplaceUpdateSupport( - JNIEnv*, jobject, jlong jhandle, jboolean jinplace_update_support) { + JNIEnv*, jclass, jlong jhandle, jboolean jinplace_update_support) { reinterpret_cast(jhandle) ->inplace_update_support = static_cast(jinplace_update_support); } @@ -3050,7 +3030,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateSupport( * Method: inplaceUpdateNumLocks * Signature: (J)J */ -jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->inplace_update_num_locks; @@ -3062,7 +3042,7 @@ jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( - JNIEnv* env, jobject, jlong jhandle, jlong jinplace_update_num_locks) { + JNIEnv* env, jclass, jlong jhandle, jlong jinplace_update_num_locks) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jinplace_update_num_locks); if (s.ok()) { @@ -3078,7 +3058,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( * Method: memtablePrefixBloomSizeRatio * Signature: (J)I */ -jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv*, jobject, +jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_prefix_bloom_size_ratio; @@ -3090,8 +3070,7 @@ jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMemtablePrefixBloomSizeRatio( - JNIEnv*, jobject, jlong jhandle, - jdouble jmemtable_prefix_bloom_size_ratio) { + JNIEnv*, jclass, jlong jhandle, jdouble jmemtable_prefix_bloom_size_ratio) { reinterpret_cast(jhandle) ->memtable_prefix_bloom_size_ratio = static_cast(jmemtable_prefix_bloom_size_ratio); @@ -3102,7 +3081,7 @@ void Java_org_rocksdb_Options_setMemtablePrefixBloomSizeRatio( * Method: experimentalMempurgeThreshold * Signature: (J)I */ -jdouble Java_org_rocksdb_Options_experimentalMempurgeThreshold(JNIEnv*, jobject, +jdouble Java_org_rocksdb_Options_experimentalMempurgeThreshold(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->experimental_mempurge_threshold; @@ -3114,7 +3093,7 @@ jdouble Java_org_rocksdb_Options_experimentalMempurgeThreshold(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setExperimentalMempurgeThreshold( - JNIEnv*, jobject, jlong jhandle, jdouble jexperimental_mempurge_threshold) { + JNIEnv*, jclass, jlong jhandle, jdouble jexperimental_mempurge_threshold) { reinterpret_cast(jhandle) ->experimental_mempurge_threshold = static_cast(jexperimental_mempurge_threshold); @@ -3125,7 +3104,7 @@ void Java_org_rocksdb_Options_setExperimentalMempurgeThreshold( * Method: memtableWholeKeyFiltering * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_memtableWholeKeyFiltering(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_memtableWholeKeyFiltering(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_whole_key_filtering; @@ -3137,7 +3116,7 @@ jboolean Java_org_rocksdb_Options_memtableWholeKeyFiltering(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setMemtableWholeKeyFiltering( - JNIEnv*, jobject, jlong jhandle, jboolean jmemtable_whole_key_filtering) { + JNIEnv*, jclass, jlong jhandle, jboolean jmemtable_whole_key_filtering) { reinterpret_cast(jhandle) ->memtable_whole_key_filtering = static_cast(jmemtable_whole_key_filtering); @@ -3148,7 +3127,7 @@ void Java_org_rocksdb_Options_setMemtableWholeKeyFiltering( * Method: bloomLocality * Signature: (J)I */ -jint Java_org_rocksdb_Options_bloomLocality(JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_bloomLocality(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->bloom_locality; } @@ -3157,7 +3136,7 @@ jint Java_org_rocksdb_Options_bloomLocality(JNIEnv*, jobject, jlong jhandle) { * Method: setBloomLocality * Signature: (JI)V */ -void Java_org_rocksdb_Options_setBloomLocality(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setBloomLocality(JNIEnv*, jclass, jlong jhandle, jint jbloom_locality) { reinterpret_cast(jhandle)->bloom_locality = static_cast(jbloom_locality); @@ -3168,7 +3147,7 @@ void Java_org_rocksdb_Options_setBloomLocality(JNIEnv*, jobject, jlong jhandle, * Method: maxSuccessiveMerges * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxSuccessiveMerges(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_maxSuccessiveMerges(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_successive_merges; @@ -3180,7 +3159,7 @@ jlong Java_org_rocksdb_Options_maxSuccessiveMerges(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( - JNIEnv* env, jobject, jlong jhandle, jlong jmax_successive_merges) { + JNIEnv* env, jclass, jlong jhandle, jlong jmax_successive_merges) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jmax_successive_merges); if (s.ok()) { @@ -3196,7 +3175,7 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( * Method: optimizeFiltersForHits * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_optimizeFiltersForHits(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_optimizeFiltersForHits(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->optimize_filters_for_hits; @@ -3208,7 +3187,7 @@ jboolean Java_org_rocksdb_Options_optimizeFiltersForHits(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setOptimizeFiltersForHits( - JNIEnv*, jobject, jlong jhandle, jboolean joptimize_filters_for_hits) { + JNIEnv*, jclass, jlong jhandle, jboolean joptimize_filters_for_hits) { reinterpret_cast(jhandle) ->optimize_filters_for_hits = static_cast(joptimize_filters_for_hits); @@ -3231,7 +3210,7 @@ void Java_org_rocksdb_Options_oldDefaults(JNIEnv*, jclass, jlong jhandle, * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_Options_optimizeForSmallDb__J(JNIEnv*, jobject, +void Java_org_rocksdb_Options_optimizeForSmallDb__J(JNIEnv*, jclass, jlong jhandle) { reinterpret_cast(jhandle)->OptimizeForSmallDb(); } @@ -3259,7 +3238,7 @@ void Java_org_rocksdb_Options_optimizeForSmallDb__JJ(JNIEnv*, jclass, * Signature: (JJ)V */ void Java_org_rocksdb_Options_optimizeForPointLookup( - JNIEnv*, jobject, jlong jhandle, jlong block_cache_size_mb) { + JNIEnv*, jclass, jlong jhandle, jlong block_cache_size_mb) { reinterpret_cast(jhandle) ->OptimizeForPointLookup(block_cache_size_mb); } @@ -3270,7 +3249,7 @@ void Java_org_rocksdb_Options_optimizeForPointLookup( * Signature: (JJ)V */ void Java_org_rocksdb_Options_optimizeLevelStyleCompaction( - JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + JNIEnv*, jclass, jlong jhandle, jlong memtable_memory_budget) { reinterpret_cast(jhandle) ->OptimizeLevelStyleCompaction(memtable_memory_budget); } @@ -3281,7 +3260,7 @@ void Java_org_rocksdb_Options_optimizeLevelStyleCompaction( * Signature: (JJ)V */ void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( - JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + JNIEnv*, jclass, jlong jhandle, jlong memtable_memory_budget) { reinterpret_cast(jhandle) ->OptimizeUniversalStyleCompaction(memtable_memory_budget); } @@ -3291,7 +3270,7 @@ void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( * Method: prepareForBulkLoad * Signature: (J)V */ -void Java_org_rocksdb_Options_prepareForBulkLoad(JNIEnv*, jobject, +void Java_org_rocksdb_Options_prepareForBulkLoad(JNIEnv*, jclass, jlong jhandle) { reinterpret_cast(jhandle)->PrepareForBulkLoad(); } @@ -3301,7 +3280,7 @@ void Java_org_rocksdb_Options_prepareForBulkLoad(JNIEnv*, jobject, * Method: memtableHugePageSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_memtableHugePageSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_memtableHugePageSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_huge_page_size; @@ -3313,7 +3292,7 @@ jlong Java_org_rocksdb_Options_memtableHugePageSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMemtableHugePageSize( - JNIEnv* env, jobject, jlong jhandle, jlong jmemtable_huge_page_size) { + JNIEnv* env, jclass, jlong jhandle, jlong jmemtable_huge_page_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jmemtable_huge_page_size); if (s.ok()) { @@ -3329,7 +3308,7 @@ void Java_org_rocksdb_Options_setMemtableHugePageSize( * Method: softPendingCompactionBytesLimit * Signature: (J)J */ -jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->soft_pending_compaction_bytes_limit; @@ -3341,7 +3320,7 @@ jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setSoftPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jsoft_pending_compaction_bytes_limit) { reinterpret_cast(jhandle) ->soft_pending_compaction_bytes_limit = @@ -3353,7 +3332,7 @@ void Java_org_rocksdb_Options_setSoftPendingCompactionBytesLimit( * Method: softHardCompactionBytesLimit * Signature: (J)J */ -jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->hard_pending_compaction_bytes_limit; @@ -3365,7 +3344,7 @@ jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setHardPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jhard_pending_compaction_bytes_limit) { reinterpret_cast(jhandle) ->hard_pending_compaction_bytes_limit = @@ -3377,7 +3356,7 @@ void Java_org_rocksdb_Options_setHardPendingCompactionBytesLimit( * Method: level0FileNumCompactionTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger; @@ -3389,7 +3368,7 @@ jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger = static_cast(jlevel0_file_num_compaction_trigger); @@ -3400,7 +3379,7 @@ void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger( * Method: level0SlowdownWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger; @@ -3412,7 +3391,7 @@ jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0SlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger = static_cast(jlevel0_slowdown_writes_trigger); @@ -3423,7 +3402,7 @@ void Java_org_rocksdb_Options_setLevel0SlowdownWritesTrigger( * Method: level0StopWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_level0StopWritesTrigger(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_level0StopWritesTrigger(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_stop_writes_trigger; @@ -3435,7 +3414,7 @@ jint Java_org_rocksdb_Options_level0StopWritesTrigger(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0StopWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_stop_writes_trigger) { reinterpret_cast(jhandle) ->level0_stop_writes_trigger = static_cast(jlevel0_stop_writes_trigger); @@ -3447,7 +3426,7 @@ void Java_org_rocksdb_Options_setLevel0StopWritesTrigger( * Signature: (J)[I */ jintArray Java_org_rocksdb_Options_maxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject, jlong jhandle) { + JNIEnv* env, jclass, jlong jhandle) { auto mbflma = reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier_additional; @@ -3485,7 +3464,7 @@ jintArray Java_org_rocksdb_Options_maxBytesForLevelMultiplierAdditional( * Signature: (J[I)V */ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject, jlong jhandle, + JNIEnv* env, jclass, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); jint* additionals = env->GetIntArrayElements( @@ -3511,7 +3490,7 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional( * Method: paranoidFileChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_paranoidFileChecks(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_paranoidFileChecks(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_file_checks; @@ -3523,7 +3502,7 @@ jboolean Java_org_rocksdb_Options_paranoidFileChecks(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setParanoidFileChecks( - JNIEnv*, jobject, jlong jhandle, jboolean jparanoid_file_checks) { + JNIEnv*, jclass, jlong jhandle, jboolean jparanoid_file_checks) { reinterpret_cast(jhandle)->paranoid_file_checks = static_cast(jparanoid_file_checks); } @@ -3534,7 +3513,7 @@ void Java_org_rocksdb_Options_setParanoidFileChecks( * Signature: (JB)V */ void Java_org_rocksdb_Options_setCompactionPriority( - JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_priority_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompaction_priority_value) { auto* opts = reinterpret_cast(jhandle); opts->compaction_pri = ROCKSDB_NAMESPACE::CompactionPriorityJni::toCppCompactionPriority( @@ -3546,7 +3525,7 @@ void Java_org_rocksdb_Options_setCompactionPriority( * Method: compactionPriority * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_compactionPriority(JNIEnv*, jobject, +jbyte Java_org_rocksdb_Options_compactionPriority(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompactionPriorityJni::toJavaCompactionPriority( @@ -3558,8 +3537,7 @@ jbyte Java_org_rocksdb_Options_compactionPriority(JNIEnv*, jobject, * Method: setReportBgIoStats * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setReportBgIoStats(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setReportBgIoStats(JNIEnv*, jclass, jlong jhandle, jboolean jreport_bg_io_stats) { auto* opts = reinterpret_cast(jhandle); opts->report_bg_io_stats = static_cast(jreport_bg_io_stats); @@ -3570,7 +3548,7 @@ void Java_org_rocksdb_Options_setReportBgIoStats(JNIEnv*, jobject, * Method: reportBgIoStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_reportBgIoStats(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_reportBgIoStats(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->report_bg_io_stats); @@ -3581,7 +3559,7 @@ jboolean Java_org_rocksdb_Options_reportBgIoStats(JNIEnv*, jobject, * Method: setTtl * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setTtl(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setTtl(JNIEnv*, jclass, jlong jhandle, jlong jttl) { auto* opts = reinterpret_cast(jhandle); opts->ttl = static_cast(jttl); @@ -3592,7 +3570,7 @@ void Java_org_rocksdb_Options_setTtl(JNIEnv*, jobject, jlong jhandle, * Method: ttl * Signature: (J)J */ -jlong Java_org_rocksdb_Options_ttl(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_ttl(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->ttl); } @@ -3603,7 +3581,7 @@ jlong Java_org_rocksdb_Options_ttl(JNIEnv*, jobject, jlong jhandle) { * Signature: (JJ)V */ void Java_org_rocksdb_Options_setPeriodicCompactionSeconds( - JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) { + JNIEnv*, jclass, jlong jhandle, jlong jperiodicCompactionSeconds) { auto* opts = reinterpret_cast(jhandle); opts->periodic_compaction_seconds = static_cast(jperiodicCompactionSeconds); @@ -3614,7 +3592,7 @@ void Java_org_rocksdb_Options_setPeriodicCompactionSeconds( * Method: periodicCompactionSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_Options_periodicCompactionSeconds(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_periodicCompactionSeconds(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->periodic_compaction_seconds); @@ -3626,7 +3604,7 @@ jlong Java_org_rocksdb_Options_periodicCompactionSeconds(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionOptionsUniversal( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jcompaction_options_universal_handle) { auto* opts = reinterpret_cast(jhandle); auto* opts_uni = @@ -3641,7 +3619,7 @@ void Java_org_rocksdb_Options_setCompactionOptionsUniversal( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionOptionsFIFO( - JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_fifo_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jcompaction_options_fifo_handle) { auto* opts = reinterpret_cast(jhandle); auto* opts_fifo = reinterpret_cast( jcompaction_options_fifo_handle); @@ -3654,7 +3632,7 @@ void Java_org_rocksdb_Options_setCompactionOptionsFIFO( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setForceConsistencyChecks( - JNIEnv*, jobject, jlong jhandle, jboolean jforce_consistency_checks) { + JNIEnv*, jclass, jlong jhandle, jboolean jforce_consistency_checks) { auto* opts = reinterpret_cast(jhandle); opts->force_consistency_checks = static_cast(jforce_consistency_checks); } @@ -3664,7 +3642,7 @@ void Java_org_rocksdb_Options_setForceConsistencyChecks( * Method: forceConsistencyChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_forceConsistencyChecks(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_forceConsistencyChecks(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->force_consistency_checks); @@ -3677,8 +3655,7 @@ jboolean Java_org_rocksdb_Options_forceConsistencyChecks(JNIEnv*, jobject, * Method: setEnableBlobFiles * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setEnableBlobFiles(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_Options_setEnableBlobFiles(JNIEnv*, jclass, jlong jhandle, jboolean jenable_blob_files) { auto* opts = reinterpret_cast(jhandle); opts->enable_blob_files = static_cast(jenable_blob_files); @@ -3689,7 +3666,7 @@ void Java_org_rocksdb_Options_setEnableBlobFiles(JNIEnv*, jobject, * Method: enableBlobFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_enableBlobFiles(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_enableBlobFiles(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->enable_blob_files); @@ -3700,7 +3677,7 @@ jboolean Java_org_rocksdb_Options_enableBlobFiles(JNIEnv*, jobject, * Method: setMinBlobSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMinBlobSize(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setMinBlobSize(JNIEnv*, jclass, jlong jhandle, jlong jmin_blob_size) { auto* opts = reinterpret_cast(jhandle); opts->min_blob_size = static_cast(jmin_blob_size); @@ -3711,7 +3688,7 @@ void Java_org_rocksdb_Options_setMinBlobSize(JNIEnv*, jobject, jlong jhandle, * Method: minBlobSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_minBlobSize(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_minBlobSize(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->min_blob_size); } @@ -3721,7 +3698,7 @@ jlong Java_org_rocksdb_Options_minBlobSize(JNIEnv*, jobject, jlong jhandle) { * Method: setBlobFileSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setBlobFileSize(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Options_setBlobFileSize(JNIEnv*, jclass, jlong jhandle, jlong jblob_file_size) { auto* opts = reinterpret_cast(jhandle); opts->blob_file_size = static_cast(jblob_file_size); @@ -3732,7 +3709,7 @@ void Java_org_rocksdb_Options_setBlobFileSize(JNIEnv*, jobject, jlong jhandle, * Method: blobFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_blobFileSize(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_blobFileSize(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_file_size); } @@ -3743,7 +3720,7 @@ jlong Java_org_rocksdb_Options_blobFileSize(JNIEnv*, jobject, jlong jhandle) { * Signature: (JB)V */ void Java_org_rocksdb_Options_setBlobCompressionType( - JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jblob_compression_type_value) { auto* opts = reinterpret_cast(jhandle); opts->blob_compression_type = ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( @@ -3755,7 +3732,7 @@ void Java_org_rocksdb_Options_setBlobCompressionType( * Method: blobCompressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_blobCompressionType(JNIEnv*, jobject, +jbyte Java_org_rocksdb_Options_blobCompressionType(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( @@ -3768,7 +3745,7 @@ jbyte Java_org_rocksdb_Options_blobCompressionType(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_Options_setEnableBlobGarbageCollection( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_blob_garbage_collection) { auto* opts = reinterpret_cast(jhandle); opts->enable_blob_garbage_collection = static_cast(jenable_blob_garbage_collection); @@ -3779,7 +3756,7 @@ void Java_org_rocksdb_Options_setEnableBlobGarbageCollection( * Method: enableBlobGarbageCollection * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_enableBlobGarbageCollection(JNIEnv*, jobject, +jboolean Java_org_rocksdb_Options_enableBlobGarbageCollection(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->enable_blob_garbage_collection); @@ -3791,7 +3768,7 @@ jboolean Java_org_rocksdb_Options_enableBlobGarbageCollection(JNIEnv*, jobject, * Signature: (JD)V */ void Java_org_rocksdb_Options_setBlobGarbageCollectionAgeCutoff( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jdouble jblob_garbage_collection_age_cutoff) { auto* opts = reinterpret_cast(jhandle); opts->blob_garbage_collection_age_cutoff = @@ -3803,8 +3780,7 @@ void Java_org_rocksdb_Options_setBlobGarbageCollectionAgeCutoff( * Method: blobGarbageCollectionAgeCutoff * Signature: (J)D */ -jdouble Java_org_rocksdb_Options_blobGarbageCollectionAgeCutoff(JNIEnv*, - jobject, +jdouble Java_org_rocksdb_Options_blobGarbageCollectionAgeCutoff(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_garbage_collection_age_cutoff); @@ -3816,7 +3792,7 @@ jdouble Java_org_rocksdb_Options_blobGarbageCollectionAgeCutoff(JNIEnv*, * Signature: (JD)V */ void Java_org_rocksdb_Options_setBlobGarbageCollectionForceThreshold( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jdouble jblob_garbage_collection_force_threshold) { auto* opts = reinterpret_cast(jhandle); opts->blob_garbage_collection_force_threshold = @@ -3829,7 +3805,7 @@ void Java_org_rocksdb_Options_setBlobGarbageCollectionForceThreshold( * Signature: (J)D */ jdouble Java_org_rocksdb_Options_blobGarbageCollectionForceThreshold( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_garbage_collection_force_threshold); } @@ -3840,7 +3816,7 @@ jdouble Java_org_rocksdb_Options_blobGarbageCollectionForceThreshold( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setBlobCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle, jlong jblob_compaction_readahead_size) { + JNIEnv*, jclass, jlong jhandle, jlong jblob_compaction_readahead_size) { auto* opts = reinterpret_cast(jhandle); opts->blob_compaction_readahead_size = static_cast(jblob_compaction_readahead_size); @@ -3851,7 +3827,7 @@ void Java_org_rocksdb_Options_setBlobCompactionReadaheadSize( * Method: blobCompactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_blobCompactionReadaheadSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_Options_blobCompactionReadaheadSize(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_compaction_readahead_size); @@ -3863,7 +3839,7 @@ jlong Java_org_rocksdb_Options_blobCompactionReadaheadSize(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setBlobFileStartingLevel( - JNIEnv*, jobject, jlong jhandle, jint jblob_file_starting_level) { + JNIEnv*, jclass, jlong jhandle, jint jblob_file_starting_level) { auto* opts = reinterpret_cast(jhandle); opts->blob_file_starting_level = jblob_file_starting_level; } @@ -3873,7 +3849,7 @@ void Java_org_rocksdb_Options_setBlobFileStartingLevel( * Method: blobFileStartingLevel * Signature: (J)I */ -jint Java_org_rocksdb_Options_blobFileStartingLevel(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_blobFileStartingLevel(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_file_starting_level); @@ -3885,7 +3861,7 @@ jint Java_org_rocksdb_Options_blobFileStartingLevel(JNIEnv*, jobject, * Signature: (JB)V */ void Java_org_rocksdb_Options_setPrepopulateBlobCache( - JNIEnv*, jobject, jlong jhandle, jbyte jprepopulate_blob_cache_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jprepopulate_blob_cache_value) { auto* opts = reinterpret_cast(jhandle); opts->prepopulate_blob_cache = ROCKSDB_NAMESPACE::PrepopulateBlobCacheJni::toCppPrepopulateBlobCache( @@ -3897,7 +3873,7 @@ void Java_org_rocksdb_Options_setPrepopulateBlobCache( * Method: prepopulateBlobCache * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jobject, +jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::PrepopulateBlobCacheJni::toJavaPrepopulateBlobCache( @@ -3910,7 +3886,7 @@ jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMemtableMaxRangeDeletions( - JNIEnv*, jobject, jlong jhandle, jint jmemtable_max_range_deletions) { + JNIEnv*, jclass, jlong jhandle, jint jmemtable_max_range_deletions) { auto* opts = reinterpret_cast(jhandle); opts->memtable_max_range_deletions = static_cast(jmemtable_max_range_deletions); @@ -3921,12 +3897,68 @@ void Java_org_rocksdb_Options_setMemtableMaxRangeDeletions( * Method: memtableMaxRangeDeletions * Signature: (J)I */ -jint Java_org_rocksdb_Options_memtableMaxRangeDeletions(JNIEnv*, jobject, +jint Java_org_rocksdb_Options_memtableMaxRangeDeletions(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->memtable_max_range_deletions); } +/* + * Class: org_rocksdb_Options + * Method: tablePropertiesCollectorFactory + * Signature: (J)[J + */ +jlongArray Java_org_rocksdb_Options_tablePropertiesCollectorFactory( + JNIEnv* env, jclass, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + const size_t size = opt->table_properties_collector_factories.size(); + jlongArray retVal = env->NewLongArray(static_cast(size)); + if (retVal == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + jlong* buf = env->GetLongArrayElements(retVal, NULL); + if (buf == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + for (size_t i = 0; i < size; i++) { + auto* wrapper = new TablePropertiesCollectorFactoriesJniWrapper(); + wrapper->table_properties_collector_factories = + opt->table_properties_collector_factories[i]; + buf[i] = GET_CPLUSPLUS_POINTER(wrapper); + } + env->ReleaseLongArrayElements(retVal, buf, 0); + return retVal; +} + +/* + * Class: org_rocksdb_Options + * Method: setTablePropertiesCollectorFactory + * Signature: (J[J)V + */ +void Java_org_rocksdb_Options_setTablePropertiesCollectorFactory( + JNIEnv* env, jclass, jlong jhandle, jlongArray j_factory_handles) { + auto* opt = reinterpret_cast(jhandle); + const jsize size = env->GetArrayLength(j_factory_handles); + + jlong* buf = env->GetLongArrayElements(j_factory_handles, NULL); + if (buf == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + opt->table_properties_collector_factories.clear(); + for (jsize i = 0; i < size; i++) { + auto* wrapper = + reinterpret_cast(buf[i]); + opt->table_properties_collector_factories.emplace_back( + wrapper->table_properties_collector_factories); + } + env->ReleaseLongArrayElements(j_factory_handles, buf, JNI_ABORT); +} + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ColumnFamilyOptions @@ -4040,8 +4072,8 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__Ljav * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal(JNIEnv*, jobject, - jlong handle) { +void Java_org_rocksdb_ColumnFamilyOptions_disposeInternalJni(JNIEnv*, jclass, + jlong handle) { auto* cfo = reinterpret_cast(handle); assert(cfo != nullptr); delete cfo; @@ -4065,8 +4097,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_oldDefaults(JNIEnv*, jclass, * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__J(JNIEnv*, - jobject, +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__J(JNIEnv*, jclass, jlong jhandle) { reinterpret_cast(jhandle) ->OptimizeForSmallDb(); @@ -4092,7 +4123,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__JJ( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup( - JNIEnv*, jobject, jlong jhandle, jlong block_cache_size_mb) { + JNIEnv*, jclass, jlong jhandle, jlong block_cache_size_mb) { reinterpret_cast(jhandle) ->OptimizeForPointLookup(block_cache_size_mb); } @@ -4103,7 +4134,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction( - JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + JNIEnv*, jclass, jlong jhandle, jlong memtable_memory_budget) { reinterpret_cast(jhandle) ->OptimizeLevelStyleCompaction(memtable_memory_budget); } @@ -4114,7 +4145,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction( - JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + JNIEnv*, jclass, jlong jhandle, jlong memtable_memory_budget) { reinterpret_cast(jhandle) ->OptimizeUniversalStyleCompaction(memtable_memory_budget); } @@ -4125,7 +4156,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( - JNIEnv*, jobject, jlong jhandle, jint builtinComparator) { + JNIEnv*, jclass, jlong jhandle, jint builtinComparator) { switch (builtinComparator) { case 1: reinterpret_cast(jhandle) @@ -4144,7 +4175,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( * Signature: (JJB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJB( - JNIEnv*, jobject, jlong jopt_handle, jlong jcomparator_handle, + JNIEnv*, jclass, jlong jopt_handle, jlong jcomparator_handle, jbyte jcomparator_type) { ROCKSDB_NAMESPACE::Comparator* comparator = nullptr; switch (jcomparator_type) { @@ -4171,7 +4202,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJB( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( - JNIEnv* env, jobject, jlong jhandle, jstring jop_name) { + JNIEnv* env, jclass, jlong jhandle, jstring jop_name) { auto* options = reinterpret_cast(jhandle); const char* op_name = env->GetStringUTFChars(jop_name, nullptr); @@ -4191,7 +4222,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( - JNIEnv*, jobject, jlong jhandle, jlong mergeOperatorHandle) { + JNIEnv*, jclass, jlong jhandle, jlong mergeOperatorHandle) { reinterpret_cast(jhandle) ->merge_operator = *(reinterpret_cast*>( @@ -4204,7 +4235,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle( - JNIEnv*, jobject, jlong jopt_handle, jlong jcompactionfilter_handle) { + JNIEnv*, jclass, jlong jopt_handle, jlong jcompactionfilter_handle) { reinterpret_cast(jopt_handle) ->compaction_filter = reinterpret_cast( @@ -4217,8 +4248,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterFactoryHandle( - JNIEnv*, jobject, jlong jopt_handle, - jlong jcompactionfilterfactory_handle) { + JNIEnv*, jclass, jlong jopt_handle, jlong jcompactionfilterfactory_handle) { auto* cff_factory = reinterpret_cast< std::shared_ptr*>( jcompactionfilterfactory_handle); @@ -4232,7 +4262,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterFactoryHandle( * Signature: (JJ)I */ void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( - JNIEnv* env, jobject, jlong jhandle, jlong jwrite_buffer_size) { + JNIEnv* env, jclass, jlong jhandle, jlong jwrite_buffer_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jwrite_buffer_size); if (s.ok()) { @@ -4248,7 +4278,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( * Method: writeBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->write_buffer_size; @@ -4260,7 +4290,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( - JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number) { + JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number) { reinterpret_cast(jhandle) ->max_write_buffer_number = jmax_write_buffer_number; } @@ -4270,7 +4300,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( * Method: maxWriteBufferNumber * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber(JNIEnv*, jobject, +jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number; @@ -4281,7 +4311,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory( - JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jfactory_handle) { reinterpret_cast(jhandle) ->memtable_factory.reset( reinterpret_cast( @@ -4294,7 +4324,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName( - JNIEnv* env, jobject, jlong jhandle) { + JNIEnv* env, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::MemTableRepFactory* tf = opt->memtable_factory.get(); @@ -4316,7 +4346,7 @@ jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor( - JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { + JNIEnv*, jclass, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle) ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewFixedPrefixTransform( static_cast(jprefix_length))); @@ -4327,7 +4357,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_useCappedPrefixExtractor( - JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { + JNIEnv*, jclass, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle) ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewCappedPrefixTransform( static_cast(jprefix_length))); @@ -4338,7 +4368,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_useCappedPrefixExtractor( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory( - JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jfactory_handle) { reinterpret_cast(jhandle) ->table_factory.reset( reinterpret_cast(jfactory_handle)); @@ -4349,7 +4379,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory( - JNIEnv*, jobject, jlong jhandle, jlong factory_handle) { + JNIEnv*, jclass, jlong jhandle, jlong factory_handle) { auto* options = reinterpret_cast(jhandle); auto factory = reinterpret_cast< @@ -4378,7 +4408,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionThreadLimiter( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName(JNIEnv* env, - jobject, + jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); @@ -4443,7 +4473,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_cfPaths(JNIEnv* env, jclass, * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->min_write_buffer_number_to_merge; } @@ -4454,7 +4484,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( - JNIEnv*, jobject, jlong jhandle, jint jmin_write_buffer_number_to_merge) { + JNIEnv*, jclass, jlong jhandle, jint jmin_write_buffer_number_to_merge) { reinterpret_cast(jhandle) ->min_write_buffer_number_to_merge = static_cast(jmin_write_buffer_number_to_merge); @@ -4466,7 +4496,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain; } @@ -4477,8 +4507,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain( - JNIEnv*, jobject, jlong jhandle, - jint jmax_write_buffer_number_to_maintain) { + JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) { reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain = static_cast(jmax_write_buffer_number_to_maintain); @@ -4490,7 +4519,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( - JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompression_type_value) { auto* cf_opts = reinterpret_cast(jhandle); cf_opts->compression = @@ -4503,7 +4532,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( * Method: compressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType(JNIEnv*, jobject, +jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType(JNIEnv*, jclass, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); @@ -4517,7 +4546,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType(JNIEnv*, jobject, * Signature: (J[B)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel( - JNIEnv* env, jobject, jlong jhandle, jbyteArray jcompressionLevels) { + JNIEnv* env, jclass, jlong jhandle, jbyteArray jcompressionLevels) { auto* options = reinterpret_cast(jhandle); auto uptr_compression_levels = @@ -4535,7 +4564,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel( * Signature: (J)[B */ jbyteArray Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel( - JNIEnv* env, jobject, jlong jhandle) { + JNIEnv* env, jclass, jlong jhandle) { auto* cf_options = reinterpret_cast(jhandle); return rocksdb_compression_list_helper(env, @@ -4548,7 +4577,7 @@ jbyteArray Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionType( - JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompression_type_value) { auto* cf_options = reinterpret_cast(jhandle); cf_options->bottommost_compression = @@ -4562,7 +4591,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionType( * Signature: (J)B */ jbyte Java_org_rocksdb_ColumnFamilyOptions_bottommostCompressionType( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* cf_options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( @@ -4574,7 +4603,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_bottommostCompressionType( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionOptions( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jbottommost_compression_options_handle) { auto* cf_options = reinterpret_cast(jhandle); @@ -4590,7 +4619,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionOptions( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionOptions( - JNIEnv*, jobject, jlong jhandle, jlong jcompression_options_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jcompression_options_handle) { auto* cf_options = reinterpret_cast(jhandle); auto* compression_options = @@ -4605,7 +4634,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionOptions( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( - JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_style) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompaction_style) { auto* cf_options = reinterpret_cast(jhandle); cf_options->compaction_style = @@ -4618,7 +4647,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( * Method: compactionStyle * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle(JNIEnv*, jobject, +jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle(JNIEnv*, jclass, jlong jhandle) { auto* cf_options = reinterpret_cast(jhandle); @@ -4632,7 +4661,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO( - JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_table_files_size) { reinterpret_cast(jhandle) ->compaction_options_fifo.max_table_files_size = static_cast(jmax_table_files_size); @@ -4644,7 +4673,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->compaction_options_fifo.max_table_files_size; } @@ -4654,7 +4683,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO( * Method: numLevels * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_numLevels(JNIEnv*, jobject, +jint Java_org_rocksdb_ColumnFamilyOptions_numLevels(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->num_levels; @@ -4665,7 +4694,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_numLevels(JNIEnv*, jobject, * Method: setNumLevels * Signature: (JI)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(JNIEnv*, jobject, +void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(JNIEnv*, jclass, jlong jhandle, jint jnum_levels) { reinterpret_cast(jhandle) @@ -4678,7 +4707,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(JNIEnv*, jobject, * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger; } @@ -4689,7 +4718,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger = static_cast(jlevel0_file_num_compaction_trigger); @@ -4701,7 +4730,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger; } @@ -4712,7 +4741,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger = static_cast(jlevel0_slowdown_writes_trigger); @@ -4724,7 +4753,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_stop_writes_trigger; } @@ -4735,7 +4764,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_stop_writes_trigger) { reinterpret_cast(jhandle) ->level0_stop_writes_trigger = static_cast(jlevel0_stop_writes_trigger); @@ -4746,7 +4775,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger( * Method: targetFileSizeBase * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->target_file_size_base; @@ -4758,7 +4787,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase( - JNIEnv*, jobject, jlong jhandle, jlong jtarget_file_size_base) { + JNIEnv*, jclass, jlong jhandle, jlong jtarget_file_size_base) { reinterpret_cast(jhandle) ->target_file_size_base = static_cast(jtarget_file_size_base); } @@ -4769,7 +4798,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->target_file_size_multiplier; } @@ -4780,7 +4809,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier( - JNIEnv*, jobject, jlong jhandle, jint jtarget_file_size_multiplier) { + JNIEnv*, jclass, jlong jhandle, jint jtarget_file_size_multiplier) { reinterpret_cast(jhandle) ->target_file_size_multiplier = static_cast(jtarget_file_size_multiplier); @@ -4791,8 +4820,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier( * Method: maxBytesForLevelBase * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(JNIEnv*, - jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_bytes_for_level_base; @@ -4804,7 +4832,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(JNIEnv*, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase( - JNIEnv*, jobject, jlong jhandle, jlong jmax_bytes_for_level_base) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_bytes_for_level_base) { reinterpret_cast(jhandle) ->max_bytes_for_level_base = static_cast(jmax_bytes_for_level_base); @@ -4816,7 +4844,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_levelCompactionDynamicLevelBytes( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level_compaction_dynamic_level_bytes; } @@ -4827,7 +4855,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_levelCompactionDynamicLevelBytes( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelCompactionDynamicLevelBytes( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_dynamic_level_bytes) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_dynamic_level_bytes) { reinterpret_cast(jhandle) ->level_compaction_dynamic_level_bytes = (jenable_dynamic_level_bytes); } @@ -4838,7 +4866,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelCompactionDynamicLevelBytes( * Signature: (J)D */ jdouble Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier; } @@ -4849,7 +4877,7 @@ jdouble Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier( * Signature: (JD)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier( - JNIEnv*, jobject, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) { + JNIEnv*, jclass, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) { reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier = static_cast(jmax_bytes_for_level_multiplier); @@ -4860,7 +4888,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier( * Method: maxCompactionBytes * Signature: (J)I */ -jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast(jhandle) @@ -4873,7 +4901,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxCompactionBytes( - JNIEnv*, jobject, jlong jhandle, jlong jmax_compaction_bytes) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_compaction_bytes) { reinterpret_cast(jhandle) ->max_compaction_bytes = static_cast(jmax_compaction_bytes); } @@ -4883,7 +4911,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxCompactionBytes( * Method: arenaBlockSize * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->arena_block_size; @@ -4895,7 +4923,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize( - JNIEnv* env, jobject, jlong jhandle, jlong jarena_block_size) { + JNIEnv* env, jclass, jlong jhandle, jlong jarena_block_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jarena_block_size); if (s.ok()) { @@ -4912,7 +4940,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->disable_auto_compactions; } @@ -4923,7 +4951,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions( - JNIEnv*, jobject, jlong jhandle, jboolean jdisable_auto_compactions) { + JNIEnv*, jclass, jlong jhandle, jboolean jdisable_auto_compactions) { reinterpret_cast(jhandle) ->disable_auto_compactions = static_cast(jdisable_auto_compactions); } @@ -4934,7 +4962,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_sequential_skip_in_iterations; } @@ -4945,7 +4973,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations( - JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { reinterpret_cast(jhandle) ->max_sequential_skip_in_iterations = static_cast(jmax_sequential_skip_in_iterations); @@ -4957,7 +4985,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->inplace_update_support; } @@ -4968,7 +4996,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport( - JNIEnv*, jobject, jlong jhandle, jboolean jinplace_update_support) { + JNIEnv*, jclass, jlong jhandle, jboolean jinplace_update_support) { reinterpret_cast(jhandle) ->inplace_update_support = static_cast(jinplace_update_support); } @@ -4979,7 +5007,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->inplace_update_num_locks; } @@ -4990,7 +5018,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( - JNIEnv* env, jobject, jlong jhandle, jlong jinplace_update_num_locks) { + JNIEnv* env, jclass, jlong jhandle, jlong jinplace_update_num_locks) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jinplace_update_num_locks); if (s.ok()) { @@ -5007,7 +5035,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( * Signature: (J)I */ jdouble Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomSizeRatio( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_prefix_bloom_size_ratio; } @@ -5018,8 +5046,7 @@ jdouble Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomSizeRatio( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomSizeRatio( - JNIEnv*, jobject, jlong jhandle, - jdouble jmemtable_prefix_bloom_size_ratio) { + JNIEnv*, jclass, jlong jhandle, jdouble jmemtable_prefix_bloom_size_ratio) { reinterpret_cast(jhandle) ->memtable_prefix_bloom_size_ratio = static_cast(jmemtable_prefix_bloom_size_ratio); @@ -5031,7 +5058,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomSizeRatio( * Signature: (J)I */ jdouble Java_org_rocksdb_ColumnFamilyOptions_experimentalMempurgeThreshold( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->experimental_mempurge_threshold; } @@ -5042,7 +5069,7 @@ jdouble Java_org_rocksdb_ColumnFamilyOptions_experimentalMempurgeThreshold( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setExperimentalMempurgeThreshold( - JNIEnv*, jobject, jlong jhandle, jdouble jexperimental_mempurge_threshold) { + JNIEnv*, jclass, jlong jhandle, jdouble jexperimental_mempurge_threshold) { reinterpret_cast(jhandle) ->experimental_mempurge_threshold = static_cast(jexperimental_mempurge_threshold); @@ -5054,7 +5081,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setExperimentalMempurgeThreshold( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_memtableWholeKeyFiltering( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_whole_key_filtering; } @@ -5065,7 +5092,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_memtableWholeKeyFiltering( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableWholeKeyFiltering( - JNIEnv*, jobject, jlong jhandle, jboolean jmemtable_whole_key_filtering) { + JNIEnv*, jclass, jlong jhandle, jboolean jmemtable_whole_key_filtering) { reinterpret_cast(jhandle) ->memtable_whole_key_filtering = static_cast(jmemtable_whole_key_filtering); @@ -5076,7 +5103,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableWholeKeyFiltering( * Method: bloomLocality * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(JNIEnv*, jobject, +jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->bloom_locality; @@ -5088,7 +5115,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality( - JNIEnv*, jobject, jlong jhandle, jint jbloom_locality) { + JNIEnv*, jclass, jlong jhandle, jint jbloom_locality) { reinterpret_cast(jhandle) ->bloom_locality = static_cast(jbloom_locality); } @@ -5098,7 +5125,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality( * Method: maxSuccessiveMerges * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_successive_merges; @@ -5110,7 +5137,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges( - JNIEnv* env, jobject, jlong jhandle, jlong jmax_successive_merges) { + JNIEnv* env, jclass, jlong jhandle, jlong jmax_successive_merges) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jmax_successive_merges); if (s.ok()) { @@ -5127,7 +5154,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_optimizeFiltersForHits( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->optimize_filters_for_hits; } @@ -5138,7 +5165,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_optimizeFiltersForHits( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits( - JNIEnv*, jobject, jlong jhandle, jboolean joptimize_filters_for_hits) { + JNIEnv*, jclass, jlong jhandle, jboolean joptimize_filters_for_hits) { reinterpret_cast(jhandle) ->optimize_filters_for_hits = static_cast(joptimize_filters_for_hits); @@ -5149,8 +5176,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits( * Method: memtableHugePageSize * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize(JNIEnv*, - jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_huge_page_size; @@ -5162,7 +5188,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize(JNIEnv*, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableHugePageSize( - JNIEnv* env, jobject, jlong jhandle, jlong jmemtable_huge_page_size) { + JNIEnv* env, jclass, jlong jhandle, jlong jmemtable_huge_page_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jmemtable_huge_page_size); if (s.ok()) { @@ -5179,7 +5205,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableHugePageSize( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_softPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->soft_pending_compaction_bytes_limit; } @@ -5190,7 +5216,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_softPendingCompactionBytesLimit( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setSoftPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jsoft_pending_compaction_bytes_limit) { reinterpret_cast(jhandle) ->soft_pending_compaction_bytes_limit = @@ -5203,7 +5229,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setSoftPendingCompactionBytesLimit( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_hardPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->hard_pending_compaction_bytes_limit; } @@ -5214,7 +5240,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_hardPendingCompactionBytesLimit( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setHardPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jhard_pending_compaction_bytes_limit) { reinterpret_cast(jhandle) ->hard_pending_compaction_bytes_limit = @@ -5227,7 +5253,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setHardPendingCompactionBytesLimit( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_level0FileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger; } @@ -5238,7 +5264,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0FileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0FileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger = static_cast(jlevel0_file_num_compaction_trigger); @@ -5250,7 +5276,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0FileNumCompactionTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_level0SlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger; } @@ -5261,7 +5287,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0SlowdownWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0SlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger = static_cast(jlevel0_slowdown_writes_trigger); @@ -5273,7 +5299,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0SlowdownWritesTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_level0StopWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->level0_stop_writes_trigger; } @@ -5284,7 +5310,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0StopWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0StopWritesTrigger( - JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + JNIEnv*, jclass, jlong jhandle, jint jlevel0_stop_writes_trigger) { reinterpret_cast(jhandle) ->level0_stop_writes_trigger = static_cast(jlevel0_stop_writes_trigger); @@ -5297,7 +5323,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0StopWritesTrigger( */ jintArray Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject, jlong jhandle) { + JNIEnv* env, jclass, jlong jhandle) { auto mbflma = reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier_additional; @@ -5335,7 +5361,7 @@ Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditional( * Signature: (J[I)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject, jlong jhandle, + JNIEnv* env, jclass, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); jint* additionals = env->GetIntArrayElements( @@ -5363,7 +5389,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditiona * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_paranoidFileChecks( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_file_checks; } @@ -5374,7 +5400,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_paranoidFileChecks( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setParanoidFileChecks( - JNIEnv*, jobject, jlong jhandle, jboolean jparanoid_file_checks) { + JNIEnv*, jclass, jlong jhandle, jboolean jparanoid_file_checks) { reinterpret_cast(jhandle) ->paranoid_file_checks = static_cast(jparanoid_file_checks); } @@ -5385,7 +5411,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setParanoidFileChecks( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionPriority( - JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_priority_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jcompaction_priority_value) { auto* cf_opts = reinterpret_cast(jhandle); cf_opts->compaction_pri = @@ -5398,7 +5424,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionPriority( * Method: compactionPriority * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority(JNIEnv*, jobject, +jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority(JNIEnv*, jclass, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); @@ -5412,7 +5438,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setReportBgIoStats( - JNIEnv*, jobject, jlong jhandle, jboolean jreport_bg_io_stats) { + JNIEnv*, jclass, jlong jhandle, jboolean jreport_bg_io_stats) { auto* cf_opts = reinterpret_cast(jhandle); cf_opts->report_bg_io_stats = static_cast(jreport_bg_io_stats); @@ -5423,7 +5449,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setReportBgIoStats( * Method: reportBgIoStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(JNIEnv*, jclass, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); @@ -5435,8 +5461,8 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(JNIEnv*, jobject, * Method: setTtl * Signature: (JJ)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setTtl(JNIEnv*, jobject, - jlong jhandle, jlong jttl) { +void Java_org_rocksdb_ColumnFamilyOptions_setTtl(JNIEnv*, jclass, jlong jhandle, + jlong jttl) { auto* cf_opts = reinterpret_cast(jhandle); cf_opts->ttl = static_cast(jttl); @@ -5448,7 +5474,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTtl(JNIEnv*, jobject, * Signature: (J)J */ JNIEXPORT jlong JNICALL -Java_org_rocksdb_ColumnFamilyOptions_ttl(JNIEnv*, jobject, jlong jhandle) { +Java_org_rocksdb_ColumnFamilyOptions_ttl(JNIEnv*, jclass, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); return static_cast(cf_opts->ttl); @@ -5460,7 +5486,7 @@ Java_org_rocksdb_ColumnFamilyOptions_ttl(JNIEnv*, jobject, jlong jhandle) { * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setPeriodicCompactionSeconds( - JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) { + JNIEnv*, jclass, jlong jhandle, jlong jperiodicCompactionSeconds) { auto* cf_opts = reinterpret_cast(jhandle); cf_opts->periodic_compaction_seconds = @@ -5473,7 +5499,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setPeriodicCompactionSeconds( * Signature: (J)J */ JNIEXPORT jlong JNICALL -Java_org_rocksdb_ColumnFamilyOptions_periodicCompactionSeconds(JNIEnv*, jobject, +Java_org_rocksdb_ColumnFamilyOptions_periodicCompactionSeconds(JNIEnv*, jclass, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); @@ -5486,7 +5512,7 @@ Java_org_rocksdb_ColumnFamilyOptions_periodicCompactionSeconds(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsUniversal( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jlong jcompaction_options_universal_handle) { auto* cf_opts = reinterpret_cast(jhandle); @@ -5502,7 +5528,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsUniversal( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsFIFO( - JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_fifo_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jcompaction_options_fifo_handle) { auto* cf_opts = reinterpret_cast(jhandle); auto* opts_fifo = reinterpret_cast( @@ -5516,7 +5542,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsFIFO( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setForceConsistencyChecks( - JNIEnv*, jobject, jlong jhandle, jboolean jforce_consistency_checks) { + JNIEnv*, jclass, jlong jhandle, jboolean jforce_consistency_checks) { auto* cf_opts = reinterpret_cast(jhandle); cf_opts->force_consistency_checks = @@ -5529,7 +5555,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setForceConsistencyChecks( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_forceConsistencyChecks( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); return static_cast(cf_opts->force_consistency_checks); @@ -5543,7 +5569,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_forceConsistencyChecks( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobFiles( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_files) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_blob_files) { auto* opts = reinterpret_cast(jhandle); opts->enable_blob_files = static_cast(jenable_blob_files); @@ -5554,7 +5580,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobFiles( * Method: enableBlobFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobFiles(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobFiles(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -5566,7 +5592,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobFiles(JNIEnv*, jobject, * Method: setMinBlobSize * Signature: (JJ)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setMinBlobSize(JNIEnv*, jobject, +void Java_org_rocksdb_ColumnFamilyOptions_setMinBlobSize(JNIEnv*, jclass, jlong jhandle, jlong jmin_blob_size) { auto* opts = @@ -5579,7 +5605,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinBlobSize(JNIEnv*, jobject, * Method: minBlobSize * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_minBlobSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_minBlobSize(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -5592,7 +5618,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_minBlobSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileSize( - JNIEnv*, jobject, jlong jhandle, jlong jblob_file_size) { + JNIEnv*, jclass, jlong jhandle, jlong jblob_file_size) { auto* opts = reinterpret_cast(jhandle); opts->blob_file_size = static_cast(jblob_file_size); @@ -5603,7 +5629,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileSize( * Method: blobFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_blobFileSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_ColumnFamilyOptions_blobFileSize(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -5616,7 +5642,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_blobFileSize(JNIEnv*, jobject, * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompressionType( - JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jblob_compression_type_value) { auto* opts = reinterpret_cast(jhandle); opts->blob_compression_type = @@ -5629,7 +5655,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompressionType( * Method: blobCompressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_blobCompressionType(JNIEnv*, jobject, +jbyte Java_org_rocksdb_ColumnFamilyOptions_blobCompressionType(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -5643,7 +5669,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_blobCompressionType(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobGarbageCollection( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_blob_garbage_collection) { auto* opts = reinterpret_cast(jhandle); opts->enable_blob_garbage_collection = @@ -5656,7 +5682,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobGarbageCollection( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobGarbageCollection( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->enable_blob_garbage_collection); @@ -5668,7 +5694,7 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobGarbageCollection( * Signature: (JD)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionAgeCutoff( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jdouble jblob_garbage_collection_age_cutoff) { auto* opts = reinterpret_cast(jhandle); @@ -5682,7 +5708,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionAgeCutoff( * Signature: (J)D */ jdouble Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionAgeCutoff( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_garbage_collection_age_cutoff); @@ -5694,7 +5720,7 @@ jdouble Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionAgeCutoff( * Signature: (JD)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionForceThreshold( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jdouble jblob_garbage_collection_force_threshold) { auto* opts = reinterpret_cast(jhandle); @@ -5709,7 +5735,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionForceThreshold */ jdouble Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionForceThreshold( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_garbage_collection_force_threshold); @@ -5721,7 +5747,7 @@ Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionForceThreshold( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle, jlong jblob_compaction_readahead_size) { + JNIEnv*, jclass, jlong jhandle, jlong jblob_compaction_readahead_size) { auto* opts = reinterpret_cast(jhandle); opts->blob_compaction_readahead_size = @@ -5734,7 +5760,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompactionReadaheadSize( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_blobCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->blob_compaction_readahead_size); @@ -5746,7 +5772,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_blobCompactionReadaheadSize( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileStartingLevel( - JNIEnv*, jobject, jlong jhandle, jint jblob_file_starting_level) { + JNIEnv*, jclass, jlong jhandle, jint jblob_file_starting_level) { auto* opts = reinterpret_cast(jhandle); opts->blob_file_starting_level = jblob_file_starting_level; @@ -5757,8 +5783,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileStartingLevel( * Method: blobFileStartingLevel * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_blobFileStartingLevel(JNIEnv*, - jobject, +jint Java_org_rocksdb_ColumnFamilyOptions_blobFileStartingLevel(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -5771,7 +5796,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_blobFileStartingLevel(JNIEnv*, * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setPrepopulateBlobCache( - JNIEnv*, jobject, jlong jhandle, jbyte jprepopulate_blob_cache_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jprepopulate_blob_cache_value) { auto* opts = reinterpret_cast(jhandle); opts->prepopulate_blob_cache = @@ -5784,8 +5809,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setPrepopulateBlobCache( * Method: prepopulateBlobCache * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*, - jobject, +jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -5799,7 +5823,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*, * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableMaxRangeDeletions( - JNIEnv*, jobject, jlong jhandle, jint jmemtable_max_range_deletions) { + JNIEnv*, jclass, jlong jhandle, jint jmemtable_max_range_deletions) { auto* opts = reinterpret_cast(jhandle); opts->memtable_max_range_deletions = jmemtable_max_range_deletions; @@ -5811,7 +5835,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableMaxRangeDeletions( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_memtableMaxRangeDeletions( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->memtable_max_range_deletions); @@ -5926,8 +5950,8 @@ jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__Ljava_lang_String_2( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_DBOptions_disposeInternal(JNIEnv*, jobject, - jlong handle) { +void Java_org_rocksdb_DBOptions_disposeInternalJni(JNIEnv*, jclass, + jlong handle) { auto* dbo = reinterpret_cast(handle); assert(dbo != nullptr); delete dbo; @@ -5938,7 +5962,7 @@ void Java_org_rocksdb_DBOptions_disposeInternal(JNIEnv*, jobject, * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_DBOptions_optimizeForSmallDb(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_optimizeForSmallDb(JNIEnv*, jclass, jlong jhandle) { reinterpret_cast(jhandle) ->OptimizeForSmallDb(); @@ -5949,7 +5973,7 @@ void Java_org_rocksdb_DBOptions_optimizeForSmallDb(JNIEnv*, jobject, * Method: setEnv * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setEnv(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setEnv(JNIEnv*, jclass, jlong jhandle, jlong jenv_handle) { reinterpret_cast(jhandle)->env = reinterpret_cast(jenv_handle); @@ -5960,7 +5984,7 @@ void Java_org_rocksdb_DBOptions_setEnv(JNIEnv*, jobject, jlong jhandle, * Method: setIncreaseParallelism * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setIncreaseParallelism(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setIncreaseParallelism(JNIEnv*, jclass, jlong jhandle, jint totalThreads) { reinterpret_cast(jhandle)->IncreaseParallelism( @@ -5972,7 +5996,7 @@ void Java_org_rocksdb_DBOptions_setIncreaseParallelism(JNIEnv*, jobject, * Method: setCreateIfMissing * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setCreateIfMissing(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setCreateIfMissing(JNIEnv*, jclass, jlong jhandle, jboolean flag) { reinterpret_cast(jhandle)->create_if_missing = @@ -5984,7 +6008,7 @@ void Java_org_rocksdb_DBOptions_setCreateIfMissing(JNIEnv*, jobject, * Method: createIfMissing * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_createIfMissing(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_createIfMissing(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->create_if_missing; @@ -5995,7 +6019,7 @@ jboolean Java_org_rocksdb_DBOptions_createIfMissing(JNIEnv*, jobject, * Method: setCreateMissingColumnFamilies * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies(JNIEnv*, jclass, jlong jhandle, jboolean flag) { reinterpret_cast(jhandle) @@ -6007,8 +6031,7 @@ void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies(JNIEnv*, jobject, * Method: createMissingColumnFamilies * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies(JNIEnv*, - jobject, +jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->create_missing_column_families; @@ -6019,8 +6042,7 @@ jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies(JNIEnv*, * Method: setErrorIfExists * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setErrorIfExists(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_DBOptions_setErrorIfExists(JNIEnv*, jclass, jlong jhandle, jboolean error_if_exists) { reinterpret_cast(jhandle)->error_if_exists = static_cast(error_if_exists); @@ -6031,7 +6053,7 @@ void Java_org_rocksdb_DBOptions_setErrorIfExists(JNIEnv*, jobject, * Method: errorIfExists * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_errorIfExists(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_errorIfExists(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->error_if_exists; @@ -6042,7 +6064,7 @@ jboolean Java_org_rocksdb_DBOptions_errorIfExists(JNIEnv*, jobject, * Method: setParanoidChecks * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setParanoidChecks(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setParanoidChecks(JNIEnv*, jclass, jlong jhandle, jboolean paranoid_checks) { reinterpret_cast(jhandle)->paranoid_checks = @@ -6054,7 +6076,7 @@ void Java_org_rocksdb_DBOptions_setParanoidChecks(JNIEnv*, jobject, * Method: paranoidChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_paranoidChecks(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_paranoidChecks(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_checks; @@ -6065,7 +6087,7 @@ jboolean Java_org_rocksdb_DBOptions_paranoidChecks(JNIEnv*, jobject, * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setRateLimiter(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setRateLimiter(JNIEnv*, jclass, jlong jhandle, jlong jrate_limiter_handle) { std::shared_ptr* pRateLimiter = reinterpret_cast*>( @@ -6080,7 +6102,7 @@ void Java_org_rocksdb_DBOptions_setRateLimiter(JNIEnv*, jobject, jlong jhandle, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setSstFileManager( - JNIEnv*, jobject, jlong jhandle, jlong jsst_file_manager_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jsst_file_manager_handle) { auto* sptr_sst_file_manager = reinterpret_cast*>( jsst_file_manager_handle); @@ -6091,14 +6113,31 @@ void Java_org_rocksdb_DBOptions_setSstFileManager( /* * Class: org_rocksdb_DBOptions * Method: setLogger - * Signature: (JJ)V + * Signature: (JJB)V */ -void Java_org_rocksdb_DBOptions_setLogger(JNIEnv*, jobject, jlong jhandle, - jlong jlogger_handle) { - std::shared_ptr* pLogger = - reinterpret_cast*>( - jlogger_handle); - reinterpret_cast(jhandle)->info_log = *pLogger; +void Java_org_rocksdb_DBOptions_setLogger(JNIEnv* env, jclass, jlong jhandle, + jlong jlogger_handle, + jbyte jlogger_type) { + auto* options = reinterpret_cast(jhandle); + switch (jlogger_type) { + case 0x1: + // JAVA_IMPLEMENTATION + options->info_log = + *(reinterpret_cast< + std::shared_ptr*>( + jlogger_handle)); + break; + case 0x2: + // STDERR_IMPLEMENTATION + options->info_log = + *(reinterpret_cast*>( + jlogger_handle)); + break; + default: + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, ROCKSDB_NAMESPACE::Status::InvalidArgument( + ROCKSDB_NAMESPACE::Slice("Unknown value for LoggerType"))); + } } /* @@ -6106,7 +6145,7 @@ void Java_org_rocksdb_DBOptions_setLogger(JNIEnv*, jobject, jlong jhandle, * Method: setInfoLogLevel * Signature: (JB)V */ -void Java_org_rocksdb_DBOptions_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setInfoLogLevel(JNIEnv*, jclass, jlong jhandle, jbyte jlog_level) { reinterpret_cast(jhandle)->info_log_level = static_cast(jlog_level); @@ -6117,7 +6156,7 @@ void Java_org_rocksdb_DBOptions_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle, * Method: infoLogLevel * Signature: (J)B */ -jbyte Java_org_rocksdb_DBOptions_infoLogLevel(JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_DBOptions_infoLogLevel(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->info_log_level); } @@ -6127,7 +6166,7 @@ jbyte Java_org_rocksdb_DBOptions_infoLogLevel(JNIEnv*, jobject, jlong jhandle) { * Method: setMaxTotalWalSize * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setMaxTotalWalSize(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setMaxTotalWalSize(JNIEnv*, jclass, jlong jhandle, jlong jmax_total_wal_size) { reinterpret_cast(jhandle)->max_total_wal_size = @@ -6139,7 +6178,7 @@ void Java_org_rocksdb_DBOptions_setMaxTotalWalSize(JNIEnv*, jobject, * Method: maxTotalWalSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_maxTotalWalSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_maxTotalWalSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_total_wal_size; @@ -6150,7 +6189,7 @@ jlong Java_org_rocksdb_DBOptions_maxTotalWalSize(JNIEnv*, jobject, * Method: setMaxOpenFiles * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setMaxOpenFiles(JNIEnv*, jclass, jlong jhandle, jint max_open_files) { reinterpret_cast(jhandle)->max_open_files = static_cast(max_open_files); @@ -6161,7 +6200,7 @@ void Java_org_rocksdb_DBOptions_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle, * Method: maxOpenFiles * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_maxOpenFiles(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_open_files; } @@ -6172,7 +6211,7 @@ jint Java_org_rocksdb_DBOptions_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) { * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxFileOpeningThreads( - JNIEnv*, jobject, jlong jhandle, jint jmax_file_opening_threads) { + JNIEnv*, jclass, jlong jhandle, jint jmax_file_opening_threads) { reinterpret_cast(jhandle) ->max_file_opening_threads = static_cast(jmax_file_opening_threads); } @@ -6182,7 +6221,7 @@ void Java_org_rocksdb_DBOptions_setMaxFileOpeningThreads( * Method: maxFileOpeningThreads * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_file_opening_threads); @@ -6193,7 +6232,7 @@ jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads(JNIEnv*, jobject, * Method: setStatistics * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setStatistics(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setStatistics(JNIEnv*, jclass, jlong jhandle, jlong jstatistics_handle) { auto* opt = reinterpret_cast(jhandle); auto* pSptr = @@ -6207,7 +6246,7 @@ void Java_org_rocksdb_DBOptions_setStatistics(JNIEnv*, jobject, jlong jhandle, * Method: statistics * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_statistics(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_statistics(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); std::shared_ptr sptr = opt->statistics; if (sptr == nullptr) { @@ -6224,7 +6263,7 @@ jlong Java_org_rocksdb_DBOptions_statistics(JNIEnv*, jobject, jlong jhandle) { * Method: setUseFsync * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setUseFsync(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setUseFsync(JNIEnv*, jclass, jlong jhandle, jboolean use_fsync) { reinterpret_cast(jhandle)->use_fsync = static_cast(use_fsync); @@ -6235,7 +6274,7 @@ void Java_org_rocksdb_DBOptions_setUseFsync(JNIEnv*, jobject, jlong jhandle, * Method: useFsync * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useFsync(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_useFsync(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->use_fsync; } @@ -6244,7 +6283,7 @@ jboolean Java_org_rocksdb_DBOptions_useFsync(JNIEnv*, jobject, jlong jhandle) { * Method: setDbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_DBOptions_setDbPaths(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setDbPaths(JNIEnv* env, jclass, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { std::vector db_paths; @@ -6290,7 +6329,7 @@ void Java_org_rocksdb_DBOptions_setDbPaths(JNIEnv* env, jobject, jlong jhandle, * Method: dbPathsLen * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_dbPathsLen(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_dbPathsLen(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_paths.size()); } @@ -6300,7 +6339,7 @@ jlong Java_org_rocksdb_DBOptions_dbPathsLen(JNIEnv*, jobject, jlong jhandle) { * Method: dbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_DBOptions_dbPaths(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_dbPaths(JNIEnv* env, jclass, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { jboolean is_copy; @@ -6341,7 +6380,7 @@ void Java_org_rocksdb_DBOptions_dbPaths(JNIEnv* env, jobject, jlong jhandle, * Method: setDbLogDir * Signature: (JLjava/lang/String)V */ -void Java_org_rocksdb_DBOptions_setDbLogDir(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setDbLogDir(JNIEnv* env, jclass, jlong jhandle, jstring jdb_log_dir) { const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr); if (log_dir == nullptr) { @@ -6359,7 +6398,7 @@ void Java_org_rocksdb_DBOptions_setDbLogDir(JNIEnv* env, jobject, jlong jhandle, * Method: dbLogDir * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_DBOptions_dbLogDir(JNIEnv* env, jobject, +jstring Java_org_rocksdb_DBOptions_dbLogDir(JNIEnv* env, jclass, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle) @@ -6371,7 +6410,7 @@ jstring Java_org_rocksdb_DBOptions_dbLogDir(JNIEnv* env, jobject, * Method: setWalDir * Signature: (JLjava/lang/String)V */ -void Java_org_rocksdb_DBOptions_setWalDir(JNIEnv* env, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setWalDir(JNIEnv* env, jclass, jlong jhandle, jstring jwal_dir) { const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0); reinterpret_cast(jhandle)->wal_dir.assign( @@ -6384,7 +6423,7 @@ void Java_org_rocksdb_DBOptions_setWalDir(JNIEnv* env, jobject, jlong jhandle, * Method: walDir * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_DBOptions_walDir(JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_DBOptions_walDir(JNIEnv* env, jclass, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle) ->wal_dir.c_str()); @@ -6396,7 +6435,7 @@ jstring Java_org_rocksdb_DBOptions_walDir(JNIEnv* env, jobject, jlong jhandle) { * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros( - JNIEnv*, jobject, jlong jhandle, jlong micros) { + JNIEnv*, jclass, jlong jhandle, jlong micros) { reinterpret_cast(jhandle) ->delete_obsolete_files_period_micros = static_cast(micros); } @@ -6407,7 +6446,7 @@ void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->delete_obsolete_files_period_micros; } @@ -6417,7 +6456,7 @@ jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros( * Method: setMaxBackgroundCompactions * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(JNIEnv*, jclass, jlong jhandle, jint max) { reinterpret_cast(jhandle) @@ -6429,7 +6468,7 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(JNIEnv*, jobject, * Method: maxBackgroundCompactions * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_compactions; @@ -6440,7 +6479,7 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(JNIEnv*, jobject, * Method: setMaxSubcompactions * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setMaxSubcompactions(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setMaxSubcompactions(JNIEnv*, jclass, jlong jhandle, jint max) { reinterpret_cast(jhandle)->max_subcompactions = static_cast(max); @@ -6451,7 +6490,7 @@ void Java_org_rocksdb_DBOptions_setMaxSubcompactions(JNIEnv*, jobject, * Method: maxSubcompactions * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxSubcompactions(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_maxSubcompactions(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_subcompactions; @@ -6463,7 +6502,7 @@ jint Java_org_rocksdb_DBOptions_maxSubcompactions(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( - JNIEnv*, jobject, jlong jhandle, jint max_background_flushes) { + JNIEnv*, jclass, jlong jhandle, jint max_background_flushes) { reinterpret_cast(jhandle) ->max_background_flushes = static_cast(max_background_flushes); } @@ -6473,7 +6512,7 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( * Method: maxBackgroundFlushes * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_flushes; @@ -6484,7 +6523,7 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes(JNIEnv*, jobject, * Method: setMaxBackgroundJobs * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs(JNIEnv*, jclass, jlong jhandle, jint max_background_jobs) { reinterpret_cast(jhandle) @@ -6496,7 +6535,7 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs(JNIEnv*, jobject, * Method: maxBackgroundJobs * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxBackgroundJobs(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_maxBackgroundJobs(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_jobs; @@ -6507,7 +6546,7 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundJobs(JNIEnv*, jobject, * Method: setMaxLogFileSize * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setMaxLogFileSize(JNIEnv* env, jobject, +void Java_org_rocksdb_DBOptions_setMaxLogFileSize(JNIEnv* env, jclass, jlong jhandle, jlong max_log_file_size) { auto s = @@ -6525,7 +6564,7 @@ void Java_org_rocksdb_DBOptions_setMaxLogFileSize(JNIEnv* env, jobject, * Method: maxLogFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_maxLogFileSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_maxLogFileSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_log_file_size; @@ -6537,7 +6576,7 @@ jlong Java_org_rocksdb_DBOptions_maxLogFileSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( - JNIEnv* env, jobject, jlong jhandle, jlong log_file_time_to_roll) { + JNIEnv* env, jclass, jlong jhandle, jlong log_file_time_to_roll) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( log_file_time_to_roll); if (s.ok()) { @@ -6553,7 +6592,7 @@ void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( * Method: logFileTimeToRoll * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->log_file_time_to_roll; @@ -6564,7 +6603,7 @@ jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll(JNIEnv*, jobject, * Method: setKeepLogFileNum * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setKeepLogFileNum(JNIEnv* env, jobject, +void Java_org_rocksdb_DBOptions_setKeepLogFileNum(JNIEnv* env, jclass, jlong jhandle, jlong keep_log_file_num) { auto s = @@ -6582,7 +6621,7 @@ void Java_org_rocksdb_DBOptions_setKeepLogFileNum(JNIEnv* env, jobject, * Method: keepLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_keepLogFileNum(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_keepLogFileNum(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->keep_log_file_num; @@ -6594,7 +6633,7 @@ jlong Java_org_rocksdb_DBOptions_keepLogFileNum(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( - JNIEnv* env, jobject, jlong jhandle, jlong recycle_log_file_num) { + JNIEnv* env, jclass, jlong jhandle, jlong recycle_log_file_num) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( recycle_log_file_num); if (s.ok()) { @@ -6610,7 +6649,7 @@ void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( * Method: recycleLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->recycle_log_file_num; @@ -6622,7 +6661,7 @@ jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( - JNIEnv*, jobject, jlong jhandle, jlong max_manifest_file_size) { + JNIEnv*, jclass, jlong jhandle, jlong max_manifest_file_size) { reinterpret_cast(jhandle) ->max_manifest_file_size = static_cast(max_manifest_file_size); } @@ -6632,7 +6671,7 @@ void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( * Method: maxManifestFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_maxManifestFileSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_maxManifestFileSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->max_manifest_file_size; @@ -6644,7 +6683,7 @@ jlong Java_org_rocksdb_DBOptions_maxManifestFileSize(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( - JNIEnv*, jobject, jlong jhandle, jint table_cache_numshardbits) { + JNIEnv*, jclass, jlong jhandle, jint table_cache_numshardbits) { reinterpret_cast(jhandle) ->table_cache_numshardbits = static_cast(table_cache_numshardbits); } @@ -6654,7 +6693,7 @@ void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( * Method: tableCacheNumshardbits * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->table_cache_numshardbits; @@ -6665,8 +6704,7 @@ jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits(JNIEnv*, jobject, * Method: setWalTtlSeconds * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalTtlSeconds(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_DBOptions_setWalTtlSeconds(JNIEnv*, jclass, jlong jhandle, jlong WAL_ttl_seconds) { reinterpret_cast(jhandle)->WAL_ttl_seconds = static_cast(WAL_ttl_seconds); @@ -6677,8 +6715,7 @@ void Java_org_rocksdb_DBOptions_setWalTtlSeconds(JNIEnv*, jobject, * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_walTtlSeconds(JNIEnv*, jobject, - jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_walTtlSeconds(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_ttl_seconds; } @@ -6688,7 +6725,7 @@ jlong Java_org_rocksdb_DBOptions_walTtlSeconds(JNIEnv*, jobject, * Method: setWalSizeLimitMB * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalSizeLimitMB(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setWalSizeLimitMB(JNIEnv*, jclass, jlong jhandle, jlong WAL_size_limit_MB) { reinterpret_cast(jhandle)->WAL_size_limit_MB = @@ -6700,7 +6737,7 @@ void Java_org_rocksdb_DBOptions_setWalSizeLimitMB(JNIEnv*, jobject, * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_walSizeLimitMB(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_walSizeLimitMB(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_size_limit_MB; @@ -6735,7 +6772,7 @@ jlong Java_org_rocksdb_DBOptions_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( - JNIEnv* env, jobject, jlong jhandle, jlong preallocation_size) { + JNIEnv* env, jclass, jlong jhandle, jlong preallocation_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( preallocation_size); if (s.ok()) { @@ -6751,7 +6788,7 @@ void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( * Method: manifestPreallocationSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->manifest_preallocation_size; @@ -6762,7 +6799,7 @@ jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(JNIEnv*, jobject, * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_reads; @@ -6773,7 +6810,7 @@ jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv*, jobject, * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv*, jclass, jlong jhandle, jboolean use_direct_reads) { reinterpret_cast(jhandle)->use_direct_reads = @@ -6786,7 +6823,7 @@ void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv*, jobject, * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_useDirectIoForFlushAndCompaction( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_io_for_flush_and_compaction; } @@ -6797,7 +6834,7 @@ jboolean Java_org_rocksdb_DBOptions_useDirectIoForFlushAndCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setUseDirectIoForFlushAndCompaction( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean use_direct_io_for_flush_and_compaction) { reinterpret_cast(jhandle) ->use_direct_io_for_flush_and_compaction = @@ -6809,7 +6846,7 @@ void Java_org_rocksdb_DBOptions_setUseDirectIoForFlushAndCompaction( * Method: setAllowFAllocate * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllowFAllocate(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setAllowFAllocate(JNIEnv*, jclass, jlong jhandle, jboolean jallow_fallocate) { reinterpret_cast(jhandle)->allow_fallocate = @@ -6821,7 +6858,7 @@ void Java_org_rocksdb_DBOptions_setAllowFAllocate(JNIEnv*, jobject, * Method: allowFAllocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowFAllocate(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_allowFAllocate(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_fallocate); @@ -6832,7 +6869,7 @@ jboolean Java_org_rocksdb_DBOptions_allowFAllocate(JNIEnv*, jobject, * Method: setAllowMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllowMmapReads(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setAllowMmapReads(JNIEnv*, jclass, jlong jhandle, jboolean allow_mmap_reads) { reinterpret_cast(jhandle)->allow_mmap_reads = @@ -6844,7 +6881,7 @@ void Java_org_rocksdb_DBOptions_setAllowMmapReads(JNIEnv*, jobject, * Method: allowMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowMmapReads(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_allowMmapReads(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_reads; @@ -6855,7 +6892,7 @@ jboolean Java_org_rocksdb_DBOptions_allowMmapReads(JNIEnv*, jobject, * Method: setAllowMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllowMmapWrites(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setAllowMmapWrites(JNIEnv*, jclass, jlong jhandle, jboolean allow_mmap_writes) { reinterpret_cast(jhandle)->allow_mmap_writes = @@ -6867,7 +6904,7 @@ void Java_org_rocksdb_DBOptions_setAllowMmapWrites(JNIEnv*, jobject, * Method: allowMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowMmapWrites(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_allowMmapWrites(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_writes; @@ -6879,7 +6916,7 @@ jboolean Java_org_rocksdb_DBOptions_allowMmapWrites(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( - JNIEnv*, jobject, jlong jhandle, jboolean is_fd_close_on_exec) { + JNIEnv*, jclass, jlong jhandle, jboolean is_fd_close_on_exec) { reinterpret_cast(jhandle) ->is_fd_close_on_exec = static_cast(is_fd_close_on_exec); } @@ -6889,7 +6926,7 @@ void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( * Method: isFdCloseOnExec * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->is_fd_close_on_exec; @@ -6901,7 +6938,7 @@ jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( - JNIEnv*, jobject, jlong jhandle, jint jstats_dump_period_sec) { + JNIEnv*, jclass, jlong jhandle, jint jstats_dump_period_sec) { reinterpret_cast(jhandle) ->stats_dump_period_sec = static_cast(jstats_dump_period_sec); @@ -6912,7 +6949,7 @@ void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( * Method: statsDumpPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->stats_dump_period_sec; @@ -6924,7 +6961,7 @@ jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setStatsPersistPeriodSec( - JNIEnv*, jobject, jlong jhandle, jint jstats_persist_period_sec) { + JNIEnv*, jclass, jlong jhandle, jint jstats_persist_period_sec) { reinterpret_cast(jhandle) ->stats_persist_period_sec = static_cast(jstats_persist_period_sec); @@ -6935,7 +6972,7 @@ void Java_org_rocksdb_DBOptions_setStatsPersistPeriodSec( * Method: statsPersistPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_statsPersistPeriodSec(JNIEnv*, jobject, +jint Java_org_rocksdb_DBOptions_statsPersistPeriodSec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->stats_persist_period_sec; @@ -6947,7 +6984,7 @@ jint Java_org_rocksdb_DBOptions_statsPersistPeriodSec(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setStatsHistoryBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jstats_history_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jstats_history_buffer_size) { reinterpret_cast(jhandle) ->stats_history_buffer_size = static_cast(jstats_history_buffer_size); @@ -6958,7 +6995,7 @@ void Java_org_rocksdb_DBOptions_setStatsHistoryBufferSize( * Method: statsHistoryBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_statsHistoryBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_statsHistoryBufferSize(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->stats_history_buffer_size; @@ -6970,7 +7007,7 @@ jlong Java_org_rocksdb_DBOptions_statsHistoryBufferSize(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( - JNIEnv*, jobject, jlong jhandle, jboolean advise_random_on_open) { + JNIEnv*, jclass, jlong jhandle, jboolean advise_random_on_open) { reinterpret_cast(jhandle) ->advise_random_on_open = static_cast(advise_random_on_open); } @@ -6980,7 +7017,7 @@ void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( * Method: adviseRandomOnOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->advise_random_on_open; @@ -6992,7 +7029,7 @@ jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setDbWriteBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jdb_write_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jdb_write_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->db_write_buffer_size = static_cast(jdb_write_buffer_size); } @@ -7003,7 +7040,7 @@ void Java_org_rocksdb_DBOptions_setDbWriteBufferSize( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWriteBufferManager( - JNIEnv*, jobject, jlong jdb_options_handle, + JNIEnv*, jclass, jlong jdb_options_handle, jlong jwrite_buffer_manager_handle) { auto* write_buffer_manager = reinterpret_cast*>( @@ -7017,43 +7054,19 @@ void Java_org_rocksdb_DBOptions_setWriteBufferManager( * Method: dbWriteBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_write_buffer_size); } -/* - * Class: org_rocksdb_DBOptions - * Method: setAccessHintOnCompactionStart - * Signature: (JB)V - */ -void Java_org_rocksdb_DBOptions_setAccessHintOnCompactionStart( - JNIEnv*, jobject, jlong jhandle, jbyte jaccess_hint_value) { - auto* opt = reinterpret_cast(jhandle); - opt->access_hint_on_compaction_start = - ROCKSDB_NAMESPACE::AccessHintJni::toCppAccessHint(jaccess_hint_value); -} - -/* - * Class: org_rocksdb_DBOptions - * Method: accessHintOnCompactionStart - * Signature: (J)B - */ -jbyte Java_org_rocksdb_DBOptions_accessHintOnCompactionStart(JNIEnv*, jobject, - jlong jhandle) { - auto* opt = reinterpret_cast(jhandle); - return ROCKSDB_NAMESPACE::AccessHintJni::toJavaAccessHint( - opt->access_hint_on_compaction_start); -} - /* * Class: org_rocksdb_DBOptions * Method: setCompactionReadaheadSize * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle, jlong jcompaction_readahead_size) { + JNIEnv*, jclass, jlong jhandle, jlong jcompaction_readahead_size) { auto* opt = reinterpret_cast(jhandle); opt->compaction_readahead_size = static_cast(jcompaction_readahead_size); @@ -7064,7 +7077,7 @@ void Java_org_rocksdb_DBOptions_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->compaction_readahead_size); @@ -7076,7 +7089,7 @@ jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setRandomAccessMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jrandom_access_max_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jrandom_access_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->random_access_max_buffer_size = static_cast(jrandom_access_max_buffer_size); @@ -7087,7 +7100,7 @@ void Java_org_rocksdb_DBOptions_setRandomAccessMaxBufferSize( * Method: randomAccessMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->random_access_max_buffer_size); @@ -7099,7 +7112,7 @@ jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWritableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong jwritable_file_max_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong jwritable_file_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->writable_file_max_buffer_size = static_cast(jwritable_file_max_buffer_size); @@ -7110,7 +7123,7 @@ void Java_org_rocksdb_DBOptions_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->writable_file_max_buffer_size); @@ -7122,7 +7135,7 @@ jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( - JNIEnv*, jobject, jlong jhandle, jboolean use_adaptive_mutex) { + JNIEnv*, jclass, jlong jhandle, jboolean use_adaptive_mutex) { reinterpret_cast(jhandle)->use_adaptive_mutex = static_cast(use_adaptive_mutex); } @@ -7132,7 +7145,7 @@ void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( * Method: useAdaptiveMutex * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->use_adaptive_mutex; @@ -7143,7 +7156,7 @@ jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex(JNIEnv*, jobject, * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setBytesPerSync(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setBytesPerSync(JNIEnv*, jclass, jlong jhandle, jlong bytes_per_sync) { reinterpret_cast(jhandle)->bytes_per_sync = static_cast(bytes_per_sync); @@ -7154,7 +7167,7 @@ void Java_org_rocksdb_DBOptions_setBytesPerSync(JNIEnv*, jobject, jlong jhandle, * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_bytesPerSync(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_bytesPerSync(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->bytes_per_sync; } @@ -7164,7 +7177,7 @@ jlong Java_org_rocksdb_DBOptions_bytesPerSync(JNIEnv*, jobject, jlong jhandle) { * Method: setWalBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalBytesPerSync(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setWalBytesPerSync(JNIEnv*, jclass, jlong jhandle, jlong jwal_bytes_per_sync) { reinterpret_cast(jhandle)->wal_bytes_per_sync = @@ -7176,7 +7189,7 @@ void Java_org_rocksdb_DBOptions_setWalBytesPerSync(JNIEnv*, jobject, * Method: walBytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_walBytesPerSync(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_walBytesPerSync(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->wal_bytes_per_sync); @@ -7188,7 +7201,7 @@ jlong Java_org_rocksdb_DBOptions_walBytesPerSync(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setStrictBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jboolean jstrict_bytes_per_sync) { + JNIEnv*, jclass, jlong jhandle, jboolean jstrict_bytes_per_sync) { reinterpret_cast(jhandle) ->strict_bytes_per_sync = jstrict_bytes_per_sync == JNI_TRUE; } @@ -7198,7 +7211,7 @@ void Java_org_rocksdb_DBOptions_setStrictBytesPerSync( * Method: strictBytesPerSync * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast(jhandle) @@ -7233,7 +7246,7 @@ jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass, * Method: setDelayedWriteRate * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setDelayedWriteRate(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setDelayedWriteRate(JNIEnv*, jclass, jlong jhandle, jlong jdelayed_write_rate) { auto* opt = reinterpret_cast(jhandle); @@ -7245,7 +7258,7 @@ void Java_org_rocksdb_DBOptions_setDelayedWriteRate(JNIEnv*, jobject, * Method: delayedWriteRate * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_delayedWriteRate(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_delayedWriteRate(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->delayed_write_rate); @@ -7257,7 +7270,7 @@ jlong Java_org_rocksdb_DBOptions_delayedWriteRate(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setEnablePipelinedWrite( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_pipelined_write) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_pipelined_write) { auto* opt = reinterpret_cast(jhandle); opt->enable_pipelined_write = jenable_pipelined_write == JNI_TRUE; } @@ -7267,7 +7280,7 @@ void Java_org_rocksdb_DBOptions_setEnablePipelinedWrite( * Method: enablePipelinedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_pipelined_write); @@ -7278,7 +7291,7 @@ jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite(JNIEnv*, jobject, * Method: setUnorderedWrite * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setUnorderedWrite(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setUnorderedWrite(JNIEnv*, jclass, jlong jhandle, jboolean junordered_write) { auto* opt = reinterpret_cast(jhandle); @@ -7290,7 +7303,7 @@ void Java_org_rocksdb_DBOptions_setUnorderedWrite(JNIEnv*, jobject, * Method: unorderedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_unorderedWrite(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_unorderedWrite(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->unordered_write); @@ -7302,7 +7315,7 @@ jboolean Java_org_rocksdb_DBOptions_unorderedWrite(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setEnableThreadTracking( - JNIEnv*, jobject, jlong jhandle, jboolean jenable_thread_tracking) { + JNIEnv*, jclass, jlong jhandle, jboolean jenable_thread_tracking) { auto* opt = reinterpret_cast(jhandle); opt->enable_thread_tracking = jenable_thread_tracking == JNI_TRUE; } @@ -7312,7 +7325,7 @@ void Java_org_rocksdb_DBOptions_setEnableThreadTracking( * Method: enableThreadTracking * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_enableThreadTracking(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_enableThreadTracking(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_thread_tracking); @@ -7324,7 +7337,7 @@ jboolean Java_org_rocksdb_DBOptions_enableThreadTracking(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllowConcurrentMemtableWrite( - JNIEnv*, jobject, jlong jhandle, jboolean allow) { + JNIEnv*, jclass, jlong jhandle, jboolean allow) { reinterpret_cast(jhandle) ->allow_concurrent_memtable_write = static_cast(allow); } @@ -7335,7 +7348,7 @@ void Java_org_rocksdb_DBOptions_setAllowConcurrentMemtableWrite( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_allowConcurrentMemtableWrite( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->allow_concurrent_memtable_write; } @@ -7346,7 +7359,7 @@ jboolean Java_org_rocksdb_DBOptions_allowConcurrentMemtableWrite( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setEnableWriteThreadAdaptiveYield( - JNIEnv*, jobject, jlong jhandle, jboolean yield) { + JNIEnv*, jclass, jlong jhandle, jboolean yield) { reinterpret_cast(jhandle) ->enable_write_thread_adaptive_yield = static_cast(yield); } @@ -7357,7 +7370,7 @@ void Java_org_rocksdb_DBOptions_setEnableWriteThreadAdaptiveYield( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_enableWriteThreadAdaptiveYield( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->enable_write_thread_adaptive_yield; } @@ -7367,7 +7380,7 @@ jboolean Java_org_rocksdb_DBOptions_enableWriteThreadAdaptiveYield( * Method: setWriteThreadMaxYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec(JNIEnv*, jclass, jlong jhandle, jlong max) { reinterpret_cast(jhandle) @@ -7379,7 +7392,7 @@ void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec(JNIEnv*, jobject, * Method: writeThreadMaxYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_max_yield_usec; @@ -7390,7 +7403,7 @@ jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec(JNIEnv*, jobject, * Method: setWriteThreadSlowYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec(JNIEnv*, jclass, jlong jhandle, jlong slow) { reinterpret_cast(jhandle) @@ -7402,7 +7415,7 @@ void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec(JNIEnv*, jobject, * Method: writeThreadSlowYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec(JNIEnv*, jobject, +jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_slow_yield_usec; @@ -7414,7 +7427,7 @@ jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setSkipStatsUpdateOnDbOpen( - JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) { + JNIEnv*, jclass, jlong jhandle, jboolean jskip_stats_update_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_stats_update_on_db_open = static_cast(jskip_stats_update_on_db_open); @@ -7425,7 +7438,7 @@ void Java_org_rocksdb_DBOptions_setSkipStatsUpdateOnDbOpen( * Method: skipStatsUpdateOnDbOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_stats_update_on_db_open); @@ -7461,7 +7474,7 @@ jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen( * Signature: (JB)V */ void Java_org_rocksdb_DBOptions_setWalRecoveryMode( - JNIEnv*, jobject, jlong jhandle, jbyte jwal_recovery_mode_value) { + JNIEnv*, jclass, jlong jhandle, jbyte jwal_recovery_mode_value) { auto* opt = reinterpret_cast(jhandle); opt->wal_recovery_mode = ROCKSDB_NAMESPACE::WALRecoveryModeJni::toCppWALRecoveryMode( @@ -7473,7 +7486,7 @@ void Java_org_rocksdb_DBOptions_setWalRecoveryMode( * Method: walRecoveryMode * Signature: (J)B */ -jbyte Java_org_rocksdb_DBOptions_walRecoveryMode(JNIEnv*, jobject, +jbyte Java_org_rocksdb_DBOptions_walRecoveryMode(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::WALRecoveryModeJni::toJavaWALRecoveryMode( @@ -7485,7 +7498,7 @@ jbyte Java_org_rocksdb_DBOptions_walRecoveryMode(JNIEnv*, jobject, * Method: setAllow2pc * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllow2pc(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setAllow2pc(JNIEnv*, jclass, jlong jhandle, jboolean jallow_2pc) { auto* opt = reinterpret_cast(jhandle); opt->allow_2pc = static_cast(jallow_2pc); @@ -7496,7 +7509,7 @@ void Java_org_rocksdb_DBOptions_setAllow2pc(JNIEnv*, jobject, jlong jhandle, * Method: allow2pc * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_2pc); } @@ -7506,7 +7519,7 @@ jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv*, jobject, jlong jhandle) { * Method: setRowCache * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setRowCache(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setRowCache(JNIEnv*, jclass, jlong jhandle, jlong jrow_cache_handle) { auto* opt = reinterpret_cast(jhandle); auto* row_cache = @@ -7520,7 +7533,7 @@ void Java_org_rocksdb_DBOptions_setRowCache(JNIEnv*, jobject, jlong jhandle, * Method: setWalFilter * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jclass, jlong jhandle, jlong jwal_filter_handle) { auto* opt = reinterpret_cast(jhandle); auto* wal_filter = reinterpret_cast( @@ -7534,7 +7547,7 @@ void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jobject, jlong jhandle, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError( - JNIEnv*, jobject, jlong jhandle, jboolean jfail_if_options_file_error) { + JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) { auto* opt = reinterpret_cast(jhandle); opt->fail_if_options_file_error = static_cast(jfail_if_options_file_error); @@ -7545,7 +7558,7 @@ void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError( * Method: failIfOptionsFileError * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->fail_if_options_file_error); @@ -7557,7 +7570,7 @@ jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setDumpMallocStats( - JNIEnv*, jobject, jlong jhandle, jboolean jdump_malloc_stats) { + JNIEnv*, jclass, jlong jhandle, jboolean jdump_malloc_stats) { auto* opt = reinterpret_cast(jhandle); opt->dump_malloc_stats = static_cast(jdump_malloc_stats); } @@ -7567,7 +7580,7 @@ void Java_org_rocksdb_DBOptions_setDumpMallocStats( * Method: dumpMallocStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_dumpMallocStats(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_dumpMallocStats(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->dump_malloc_stats); @@ -7579,7 +7592,7 @@ jboolean Java_org_rocksdb_DBOptions_dumpMallocStats(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringRecovery( - JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_recovery) { + JNIEnv*, jclass, jlong jhandle, jboolean javoid_flush_during_recovery) { auto* opt = reinterpret_cast(jhandle); opt->avoid_flush_during_recovery = static_cast(javoid_flush_during_recovery); @@ -7590,7 +7603,7 @@ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringRecovery( * Method: avoidFlushDuringRecovery * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_recovery); @@ -7602,7 +7615,7 @@ jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllowIngestBehind( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_ingest_behind) { + JNIEnv*, jclass, jlong jhandle, jboolean jallow_ingest_behind) { auto* opt = reinterpret_cast(jhandle); opt->allow_ingest_behind = jallow_ingest_behind == JNI_TRUE; } @@ -7612,7 +7625,7 @@ void Java_org_rocksdb_DBOptions_setAllowIngestBehind( * Method: allowIngestBehind * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowIngestBehind(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_allowIngestBehind(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_ingest_behind); @@ -7623,7 +7636,7 @@ jboolean Java_org_rocksdb_DBOptions_allowIngestBehind(JNIEnv*, jobject, * Method: setTwoWriteQueues * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setTwoWriteQueues(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setTwoWriteQueues(JNIEnv*, jclass, jlong jhandle, jboolean jtwo_write_queues) { auto* opt = reinterpret_cast(jhandle); @@ -7635,7 +7648,7 @@ void Java_org_rocksdb_DBOptions_setTwoWriteQueues(JNIEnv*, jobject, * Method: twoWriteQueues * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_twoWriteQueues(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_twoWriteQueues(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->two_write_queues); @@ -7646,7 +7659,7 @@ jboolean Java_org_rocksdb_DBOptions_twoWriteQueues(JNIEnv*, jobject, * Method: setManualWalFlush * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setManualWalFlush(JNIEnv*, jobject, +void Java_org_rocksdb_DBOptions_setManualWalFlush(JNIEnv*, jclass, jlong jhandle, jboolean jmanual_wal_flush) { auto* opt = reinterpret_cast(jhandle); @@ -7658,7 +7671,7 @@ void Java_org_rocksdb_DBOptions_setManualWalFlush(JNIEnv*, jobject, * Method: manualWalFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_manualWalFlush(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_manualWalFlush(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->manual_wal_flush); @@ -7669,7 +7682,7 @@ jboolean Java_org_rocksdb_DBOptions_manualWalFlush(JNIEnv*, jobject, * Method: setAtomicFlush * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAtomicFlush(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_DBOptions_setAtomicFlush(JNIEnv*, jclass, jlong jhandle, jboolean jatomic_flush) { auto* opt = reinterpret_cast(jhandle); opt->atomic_flush = jatomic_flush == JNI_TRUE; @@ -7680,7 +7693,7 @@ void Java_org_rocksdb_DBOptions_setAtomicFlush(JNIEnv*, jobject, jlong jhandle, * Method: atomicFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_atomicFlush(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_atomicFlush(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->atomic_flush); @@ -7692,7 +7705,7 @@ jboolean Java_org_rocksdb_DBOptions_atomicFlush(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringShutdown( - JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_shutdown) { + JNIEnv*, jclass, jlong jhandle, jboolean javoid_flush_during_shutdown) { auto* opt = reinterpret_cast(jhandle); opt->avoid_flush_during_shutdown = static_cast(javoid_flush_during_shutdown); @@ -7703,7 +7716,7 @@ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringShutdown( * Method: avoidFlushDuringShutdown * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown(JNIEnv*, jobject, +jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_shutdown); @@ -7895,8 +7908,8 @@ jlong Java_org_rocksdb_WriteOptions_copyWriteOptions(JNIEnv*, jclass, * Method: disposeInternal * Signature: ()V */ -void Java_org_rocksdb_WriteOptions_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_WriteOptions_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* write_options = reinterpret_cast(jhandle); assert(write_options != nullptr); @@ -7908,7 +7921,7 @@ void Java_org_rocksdb_WriteOptions_disposeInternal(JNIEnv*, jobject, * Method: setSync * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setSync(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_WriteOptions_setSync(JNIEnv*, jclass, jlong jhandle, jboolean jflag) { reinterpret_cast(jhandle)->sync = jflag; } @@ -7918,7 +7931,7 @@ void Java_org_rocksdb_WriteOptions_setSync(JNIEnv*, jobject, jlong jhandle, * Method: sync * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_sync(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_WriteOptions_sync(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->sync; } @@ -7927,8 +7940,7 @@ jboolean Java_org_rocksdb_WriteOptions_sync(JNIEnv*, jobject, jlong jhandle) { * Method: setDisableWAL * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setDisableWAL(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_WriteOptions_setDisableWAL(JNIEnv*, jclass, jlong jhandle, jboolean jflag) { reinterpret_cast(jhandle)->disableWAL = jflag; @@ -7939,7 +7951,7 @@ void Java_org_rocksdb_WriteOptions_setDisableWAL(JNIEnv*, jobject, * Method: disableWAL * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_disableWAL(JNIEnv*, jobject, +jboolean Java_org_rocksdb_WriteOptions_disableWAL(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->disableWAL; @@ -7951,7 +7963,7 @@ jboolean Java_org_rocksdb_WriteOptions_disableWAL(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setIgnoreMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle, jboolean jignore_missing_column_families) { + JNIEnv*, jclass, jlong jhandle, jboolean jignore_missing_column_families) { reinterpret_cast(jhandle) ->ignore_missing_column_families = static_cast(jignore_missing_column_families); @@ -7963,7 +7975,7 @@ void Java_org_rocksdb_WriteOptions_setIgnoreMissingColumnFamilies( * Signature: (J)Z */ jboolean Java_org_rocksdb_WriteOptions_ignoreMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->ignore_missing_column_families; } @@ -7973,8 +7985,7 @@ jboolean Java_org_rocksdb_WriteOptions_ignoreMissingColumnFamilies( * Method: setNoSlowdown * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setNoSlowdown(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_WriteOptions_setNoSlowdown(JNIEnv*, jclass, jlong jhandle, jboolean jno_slowdown) { reinterpret_cast(jhandle)->no_slowdown = static_cast(jno_slowdown); @@ -7985,7 +7996,7 @@ void Java_org_rocksdb_WriteOptions_setNoSlowdown(JNIEnv*, jobject, * Method: noSlowdown * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_noSlowdown(JNIEnv*, jobject, +jboolean Java_org_rocksdb_WriteOptions_noSlowdown(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->no_slowdown; @@ -7996,7 +8007,7 @@ jboolean Java_org_rocksdb_WriteOptions_noSlowdown(JNIEnv*, jobject, * Method: setLowPri * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setLowPri(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_WriteOptions_setLowPri(JNIEnv*, jclass, jlong jhandle, jboolean jlow_pri) { reinterpret_cast(jhandle)->low_pri = static_cast(jlow_pri); @@ -8007,7 +8018,7 @@ void Java_org_rocksdb_WriteOptions_setLowPri(JNIEnv*, jobject, jlong jhandle, * Method: lowPri * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_lowPri(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_WriteOptions_lowPri(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->low_pri; } @@ -8017,7 +8028,7 @@ jboolean Java_org_rocksdb_WriteOptions_lowPri(JNIEnv*, jobject, jlong jhandle) { * Signature: (J)Z */ jboolean Java_org_rocksdb_WriteOptions_memtableInsertHintPerBatch( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_insert_hint_per_batch; } @@ -8028,7 +8039,7 @@ jboolean Java_org_rocksdb_WriteOptions_memtableInsertHintPerBatch( * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setMemtableInsertHintPerBatch( - JNIEnv*, jobject, jlong jhandle, jboolean jmemtable_insert_hint_per_batch) { + JNIEnv*, jclass, jlong jhandle, jboolean jmemtable_insert_hint_per_batch) { reinterpret_cast(jhandle) ->memtable_insert_hint_per_batch = static_cast(jmemtable_insert_hint_per_batch); @@ -8076,8 +8087,8 @@ jlong Java_org_rocksdb_ReadOptions_copyReadOptions(JNIEnv*, jclass, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ReadOptions_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_ReadOptions_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* read_options = reinterpret_cast(jhandle); assert(read_options != nullptr); @@ -8090,7 +8101,7 @@ void Java_org_rocksdb_ReadOptions_disposeInternal(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setVerifyChecksums( - JNIEnv*, jobject, jlong jhandle, jboolean jverify_checksums) { + JNIEnv*, jclass, jlong jhandle, jboolean jverify_checksums) { reinterpret_cast(jhandle)->verify_checksums = static_cast(jverify_checksums); } @@ -8100,7 +8111,7 @@ void Java_org_rocksdb_ReadOptions_setVerifyChecksums( * Method: verifyChecksums * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->verify_checksums; @@ -8111,7 +8122,7 @@ jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(JNIEnv*, jobject, * Method: setFillCache * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setFillCache(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setFillCache(JNIEnv*, jclass, jlong jhandle, jboolean jfill_cache) { reinterpret_cast(jhandle)->fill_cache = static_cast(jfill_cache); @@ -8122,7 +8133,7 @@ void Java_org_rocksdb_ReadOptions_setFillCache(JNIEnv*, jobject, jlong jhandle, * Method: fillCache * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_fillCache(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ReadOptions_fillCache(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->fill_cache; } @@ -8132,7 +8143,7 @@ jboolean Java_org_rocksdb_ReadOptions_fillCache(JNIEnv*, jobject, * Method: setTailing * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setTailing(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setTailing(JNIEnv*, jclass, jlong jhandle, jboolean jtailing) { reinterpret_cast(jhandle)->tailing = static_cast(jtailing); @@ -8143,7 +8154,7 @@ void Java_org_rocksdb_ReadOptions_setTailing(JNIEnv*, jobject, jlong jhandle, * Method: tailing * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->tailing; } @@ -8152,7 +8163,7 @@ jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jobject, jlong jhandle) { * Method: managed * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->managed; } @@ -8161,7 +8172,7 @@ jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jobject, jlong jhandle) { * Method: setManaged * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jclass, jlong jhandle, jboolean jmanaged) { reinterpret_cast(jhandle)->managed = static_cast(jmanaged); @@ -8172,7 +8183,7 @@ void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jobject, jlong jhandle, * Method: totalOrderSeek * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->total_order_seek; @@ -8184,7 +8195,7 @@ jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setTotalOrderSeek( - JNIEnv*, jobject, jlong jhandle, jboolean jtotal_order_seek) { + JNIEnv*, jclass, jlong jhandle, jboolean jtotal_order_seek) { reinterpret_cast(jhandle)->total_order_seek = static_cast(jtotal_order_seek); } @@ -8194,7 +8205,7 @@ void Java_org_rocksdb_ReadOptions_setTotalOrderSeek( * Method: prefixSameAsStart * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle) ->prefix_same_as_start; @@ -8206,7 +8217,7 @@ jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart( - JNIEnv*, jobject, jlong jhandle, jboolean jprefix_same_as_start) { + JNIEnv*, jclass, jlong jhandle, jboolean jprefix_same_as_start) { reinterpret_cast(jhandle) ->prefix_same_as_start = static_cast(jprefix_same_as_start); } @@ -8216,7 +8227,7 @@ void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart( * Method: pinData * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_pinData(JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_pinData(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->pin_data; } @@ -8225,7 +8236,7 @@ jboolean Java_org_rocksdb_ReadOptions_pinData(JNIEnv*, jobject, jlong jhandle) { * Method: setPinData * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setPinData(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setPinData(JNIEnv*, jclass, jlong jhandle, jboolean jpin_data) { reinterpret_cast(jhandle)->pin_data = static_cast(jpin_data); @@ -8237,7 +8248,7 @@ void Java_org_rocksdb_ReadOptions_setPinData(JNIEnv*, jobject, jlong jhandle, * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_backgroundPurgeOnIteratorCleanup( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->background_purge_on_iterator_cleanup); } @@ -8248,7 +8259,7 @@ jboolean Java_org_rocksdb_ReadOptions_backgroundPurgeOnIteratorCleanup( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setBackgroundPurgeOnIteratorCleanup( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean jbackground_purge_on_iterator_cleanup) { auto* opt = reinterpret_cast(jhandle); opt->background_purge_on_iterator_cleanup = @@ -8260,7 +8271,7 @@ void Java_org_rocksdb_ReadOptions_setBackgroundPurgeOnIteratorCleanup( * Method: readaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_readaheadSize(JNIEnv*, jobject, +jlong Java_org_rocksdb_ReadOptions_readaheadSize(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->readahead_size); @@ -8271,7 +8282,7 @@ jlong Java_org_rocksdb_ReadOptions_readaheadSize(JNIEnv*, jobject, * Method: setReadaheadSize * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setReadaheadSize(JNIEnv*, jobject, +void Java_org_rocksdb_ReadOptions_setReadaheadSize(JNIEnv*, jclass, jlong jhandle, jlong jreadahead_size) { auto* opt = reinterpret_cast(jhandle); @@ -8283,7 +8294,7 @@ void Java_org_rocksdb_ReadOptions_setReadaheadSize(JNIEnv*, jobject, * Method: maxSkippableInternalKeys * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys(JNIEnv*, jobject, +jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_skippable_internal_keys); @@ -8295,7 +8306,7 @@ jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ReadOptions_setMaxSkippableInternalKeys( - JNIEnv*, jobject, jlong jhandle, jlong jmax_skippable_internal_keys) { + JNIEnv*, jclass, jlong jhandle, jlong jmax_skippable_internal_keys) { auto* opt = reinterpret_cast(jhandle); opt->max_skippable_internal_keys = static_cast(jmax_skippable_internal_keys); @@ -8306,7 +8317,7 @@ void Java_org_rocksdb_ReadOptions_setMaxSkippableInternalKeys( * Method: ignoreRangeDeletions * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->ignore_range_deletions); @@ -8318,7 +8329,7 @@ jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setIgnoreRangeDeletions( - JNIEnv*, jobject, jlong jhandle, jboolean jignore_range_deletions) { + JNIEnv*, jclass, jlong jhandle, jboolean jignore_range_deletions) { auto* opt = reinterpret_cast(jhandle); opt->ignore_range_deletions = static_cast(jignore_range_deletions); } @@ -8328,7 +8339,7 @@ void Java_org_rocksdb_ReadOptions_setIgnoreRangeDeletions( * Method: setSnapshot * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setSnapshot(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setSnapshot(JNIEnv*, jclass, jlong jhandle, jlong jsnapshot) { reinterpret_cast(jhandle)->snapshot = reinterpret_cast(jsnapshot); @@ -8339,7 +8350,7 @@ void Java_org_rocksdb_ReadOptions_setSnapshot(JNIEnv*, jobject, jlong jhandle, * Method: snapshot * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_snapshot(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_snapshot(JNIEnv*, jclass, jlong jhandle) { auto& snapshot = reinterpret_cast(jhandle)->snapshot; return GET_CPLUSPLUS_POINTER(snapshot); @@ -8350,7 +8361,7 @@ jlong Java_org_rocksdb_ReadOptions_snapshot(JNIEnv*, jobject, jlong jhandle) { * Method: readTier * Signature: (J)B */ -jbyte Java_org_rocksdb_ReadOptions_readTier(JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_ReadOptions_readTier(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->read_tier); } @@ -8360,7 +8371,7 @@ jbyte Java_org_rocksdb_ReadOptions_readTier(JNIEnv*, jobject, jlong jhandle) { * Method: setReadTier * Signature: (JB)V */ -void Java_org_rocksdb_ReadOptions_setReadTier(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setReadTier(JNIEnv*, jclass, jlong jhandle, jbyte jread_tier) { reinterpret_cast(jhandle)->read_tier = static_cast(jread_tier); @@ -8372,7 +8383,7 @@ void Java_org_rocksdb_ReadOptions_setReadTier(JNIEnv*, jobject, jlong jhandle, * Signature: (JJ)I */ void Java_org_rocksdb_ReadOptions_setIterateUpperBound( - JNIEnv*, jobject, jlong jhandle, jlong jupper_bound_slice_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jupper_bound_slice_handle) { reinterpret_cast(jhandle) ->iterate_upper_bound = reinterpret_cast(jupper_bound_slice_handle); @@ -8383,7 +8394,7 @@ void Java_org_rocksdb_ReadOptions_setIterateUpperBound( * Method: iterateUpperBound * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_iterateUpperBound(JNIEnv*, jobject, +jlong Java_org_rocksdb_ReadOptions_iterateUpperBound(JNIEnv*, jclass, jlong jhandle) { auto& upper_bound_slice_handle = reinterpret_cast(jhandle) @@ -8397,7 +8408,7 @@ jlong Java_org_rocksdb_ReadOptions_iterateUpperBound(JNIEnv*, jobject, * Signature: (JJ)I */ void Java_org_rocksdb_ReadOptions_setIterateLowerBound( - JNIEnv*, jobject, jlong jhandle, jlong jlower_bound_slice_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jlower_bound_slice_handle) { reinterpret_cast(jhandle) ->iterate_lower_bound = reinterpret_cast(jlower_bound_slice_handle); @@ -8408,7 +8419,7 @@ void Java_org_rocksdb_ReadOptions_setIterateLowerBound( * Method: iterateLowerBound * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_iterateLowerBound(JNIEnv*, jobject, +jlong Java_org_rocksdb_ReadOptions_iterateLowerBound(JNIEnv*, jclass, jlong jhandle) { auto& lower_bound_slice_handle = reinterpret_cast(jhandle) @@ -8422,7 +8433,7 @@ jlong Java_org_rocksdb_ReadOptions_iterateLowerBound(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ReadOptions_setTableFilter( - JNIEnv*, jobject, jlong jhandle, jlong jjni_table_filter_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jjni_table_filter_handle) { auto* opt = reinterpret_cast(jhandle); auto* jni_table_filter = reinterpret_cast( @@ -8435,7 +8446,7 @@ void Java_org_rocksdb_ReadOptions_setTableFilter( * Method: autoPrefixMode * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_autoPrefixMode(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ReadOptions_autoPrefixMode(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->auto_prefix_mode); @@ -8447,7 +8458,7 @@ jboolean Java_org_rocksdb_ReadOptions_autoPrefixMode(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setAutoPrefixMode( - JNIEnv*, jobject, jlong jhandle, jboolean jauto_prefix_mode) { + JNIEnv*, jclass, jlong jhandle, jboolean jauto_prefix_mode) { auto* opt = reinterpret_cast(jhandle); opt->auto_prefix_mode = static_cast(jauto_prefix_mode); } @@ -8457,7 +8468,7 @@ void Java_org_rocksdb_ReadOptions_setAutoPrefixMode( * Method: timestamp * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_timestamp(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_timestamp(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); auto& timestamp_slice_handle = opt->timestamp; return reinterpret_cast(timestamp_slice_handle); @@ -8468,7 +8479,7 @@ jlong Java_org_rocksdb_ReadOptions_timestamp(JNIEnv*, jobject, jlong jhandle) { * Method: setTimestamp * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setTimestamp(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setTimestamp(JNIEnv*, jclass, jlong jhandle, jlong jtimestamp_slice_handle) { auto* opt = reinterpret_cast(jhandle); opt->timestamp = @@ -8480,8 +8491,7 @@ void Java_org_rocksdb_ReadOptions_setTimestamp(JNIEnv*, jobject, jlong jhandle, * Method: iterStartTs * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_iterStartTs(JNIEnv*, jobject, - jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_iterStartTs(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); auto& iter_start_ts_handle = opt->iter_start_ts; return reinterpret_cast(iter_start_ts_handle); @@ -8492,8 +8502,7 @@ jlong Java_org_rocksdb_ReadOptions_iterStartTs(JNIEnv*, jobject, * Method: setIterStartTs * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setIterStartTs(JNIEnv*, jobject, - jlong jhandle, +void Java_org_rocksdb_ReadOptions_setIterStartTs(JNIEnv*, jclass, jlong jhandle, jlong jiter_start_ts_handle) { auto* opt = reinterpret_cast(jhandle); opt->iter_start_ts = @@ -8505,7 +8514,7 @@ void Java_org_rocksdb_ReadOptions_setIterStartTs(JNIEnv*, jobject, * Method: deadline * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_deadline(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_deadline(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->deadline.count()); } @@ -8515,7 +8524,7 @@ jlong Java_org_rocksdb_ReadOptions_deadline(JNIEnv*, jobject, jlong jhandle) { * Method: setDeadline * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setDeadline(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setDeadline(JNIEnv*, jclass, jlong jhandle, jlong jdeadline) { auto* opt = reinterpret_cast(jhandle); opt->deadline = std::chrono::microseconds(static_cast(jdeadline)); @@ -8526,7 +8535,7 @@ void Java_org_rocksdb_ReadOptions_setDeadline(JNIEnv*, jobject, jlong jhandle, * Method: ioTimeout * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_ioTimeout(JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_ioTimeout(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->io_timeout.count()); } @@ -8536,7 +8545,7 @@ jlong Java_org_rocksdb_ReadOptions_ioTimeout(JNIEnv*, jobject, jlong jhandle) { * Method: setIoTimeout * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setIoTimeout(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_ReadOptions_setIoTimeout(JNIEnv*, jclass, jlong jhandle, jlong jio_timeout) { auto* opt = reinterpret_cast(jhandle); opt->io_timeout = @@ -8548,7 +8557,7 @@ void Java_org_rocksdb_ReadOptions_setIoTimeout(JNIEnv*, jobject, jlong jhandle, * Method: valueSizeSofLimit * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_valueSizeSoftLimit(JNIEnv*, jobject, +jlong Java_org_rocksdb_ReadOptions_valueSizeSoftLimit(JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->value_size_soft_limit); @@ -8560,11 +8569,32 @@ jlong Java_org_rocksdb_ReadOptions_valueSizeSoftLimit(JNIEnv*, jobject, * Signature: (JJ)V */ void Java_org_rocksdb_ReadOptions_setValueSizeSoftLimit( - JNIEnv*, jobject, jlong jhandle, jlong jvalue_size_soft_limit) { + JNIEnv*, jclass, jlong jhandle, jlong jvalue_size_soft_limit) { auto* opt = reinterpret_cast(jhandle); opt->value_size_soft_limit = static_cast(jvalue_size_soft_limit); } +/* + * Class: org_rocksdb_ReadOptions + * Method: asyncIo + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_asyncIo(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->async_io); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setAsyncIo + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setAsyncIo(JNIEnv*, jobject, jlong jhandle, + jboolean jasync_io) { + auto* opt = reinterpret_cast(jhandle); + opt->async_io = static_cast(jasync_io); +} + ///////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ComparatorOptions @@ -8584,7 +8614,7 @@ jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions(JNIEnv*, jclass) { * Signature: (J)B */ jbyte Java_org_rocksdb_ComparatorOptions_reusedSynchronisationType( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* comparator_opt = reinterpret_cast( jhandle); @@ -8599,7 +8629,7 @@ jbyte Java_org_rocksdb_ComparatorOptions_reusedSynchronisationType( * Signature: (JB)V */ void Java_org_rocksdb_ComparatorOptions_setReusedSynchronisationType( - JNIEnv*, jobject, jlong jhandle, jbyte jreused_synhcronisation_type) { + JNIEnv*, jclass, jlong jhandle, jbyte jreused_synhcronisation_type) { auto* comparator_opt = reinterpret_cast( jhandle); @@ -8613,7 +8643,7 @@ void Java_org_rocksdb_ComparatorOptions_setReusedSynchronisationType( * Method: useDirectBuffer * Signature: (J)Z */ -jboolean Java_org_rocksdb_ComparatorOptions_useDirectBuffer(JNIEnv*, jobject, +jboolean Java_org_rocksdb_ComparatorOptions_useDirectBuffer(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast( @@ -8627,7 +8657,7 @@ jboolean Java_org_rocksdb_ComparatorOptions_useDirectBuffer(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_ComparatorOptions_setUseDirectBuffer( - JNIEnv*, jobject, jlong jhandle, jboolean jdirect_buffer) { + JNIEnv*, jclass, jlong jhandle, jboolean jdirect_buffer) { reinterpret_cast(jhandle) ->direct_buffer = jdirect_buffer == JNI_TRUE; } @@ -8637,7 +8667,7 @@ void Java_org_rocksdb_ComparatorOptions_setUseDirectBuffer( * Method: maxReusedBufferSize * Signature: (J)I */ -jint Java_org_rocksdb_ComparatorOptions_maxReusedBufferSize(JNIEnv*, jobject, +jint Java_org_rocksdb_ComparatorOptions_maxReusedBufferSize(JNIEnv*, jclass, jlong jhandle) { return static_cast( reinterpret_cast( @@ -8651,7 +8681,7 @@ jint Java_org_rocksdb_ComparatorOptions_maxReusedBufferSize(JNIEnv*, jobject, * Signature: (JI)V */ void Java_org_rocksdb_ComparatorOptions_setMaxReusedBufferSize( - JNIEnv*, jobject, jlong jhandle, jint jmax_reused_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jint jmax_reused_buffer_size) { reinterpret_cast(jhandle) ->max_reused_buffer_size = static_cast(jmax_reused_buffer_size); } @@ -8661,8 +8691,8 @@ void Java_org_rocksdb_ComparatorOptions_setMaxReusedBufferSize( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ComparatorOptions_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_ComparatorOptions_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* comparator_opt = reinterpret_cast( jhandle); @@ -8688,7 +8718,7 @@ jlong Java_org_rocksdb_FlushOptions_newFlushOptions(JNIEnv*, jclass) { * Method: setWaitForFlush * Signature: (JZ)V */ -void Java_org_rocksdb_FlushOptions_setWaitForFlush(JNIEnv*, jobject, +void Java_org_rocksdb_FlushOptions_setWaitForFlush(JNIEnv*, jclass, jlong jhandle, jboolean jwait) { reinterpret_cast(jhandle)->wait = @@ -8700,7 +8730,7 @@ void Java_org_rocksdb_FlushOptions_setWaitForFlush(JNIEnv*, jobject, * Method: waitForFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_FlushOptions_waitForFlush(JNIEnv*, jobject, +jboolean Java_org_rocksdb_FlushOptions_waitForFlush(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->wait; } @@ -8711,7 +8741,7 @@ jboolean Java_org_rocksdb_FlushOptions_waitForFlush(JNIEnv*, jobject, * Signature: (JZ)V */ void Java_org_rocksdb_FlushOptions_setAllowWriteStall( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_write_stall) { + JNIEnv*, jclass, jlong jhandle, jboolean jallow_write_stall) { auto* flush_options = reinterpret_cast(jhandle); flush_options->allow_write_stall = jallow_write_stall == JNI_TRUE; @@ -8722,7 +8752,7 @@ void Java_org_rocksdb_FlushOptions_setAllowWriteStall( * Method: allowWriteStall * Signature: (J)Z */ -jboolean Java_org_rocksdb_FlushOptions_allowWriteStall(JNIEnv*, jobject, +jboolean Java_org_rocksdb_FlushOptions_allowWriteStall(JNIEnv*, jclass, jlong jhandle) { auto* flush_options = reinterpret_cast(jhandle); @@ -8734,8 +8764,8 @@ jboolean Java_org_rocksdb_FlushOptions_allowWriteStall(JNIEnv*, jobject, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_FlushOptions_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_FlushOptions_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* flush_opt = reinterpret_cast(jhandle); assert(flush_opt != nullptr); delete flush_opt; diff --git a/java/rocksjni/persistent_cache.cc b/java/rocksjni/persistent_cache.cc index 295d9179804..2b019ee6e1c 100644 --- a/java/rocksjni/persistent_cache.cc +++ b/java/rocksjni/persistent_cache.cc @@ -51,8 +51,8 @@ jlong Java_org_rocksdb_PersistentCache_newPersistentCache( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_PersistentCache_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_PersistentCache_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* cache = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 840956dae9b..8a95b995e56 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -222,6 +222,64 @@ class IllegalArgumentExceptionJni } }; +// The portal class for java.lang.IllegalArgumentException +class OutOfMemoryErrorJni : public JavaException { + public: + /** + * Get the Java Class java.lang.OutOfMemoryError + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaException::getJClass(env, "java/lang/OutOfMemoryError"); + } + + /** + * Create and throw a Java OutOfMemoryError with the provided message + * + * @param env A pointer to the Java environment + * @param msg The message for the exception + * + * @return true if an exception was thrown, false otherwise + */ + static bool ThrowNew(JNIEnv* env, const std::string& msg) { + return JavaException::ThrowNew(env, msg); + } + + /** + * Create and throw a Java OutOfMemoryError with the provided status + * + * If s.ok() == true, then this function will not throw any exception. + * + * @param env A pointer to the Java environment + * @param s The status for the exception + * + * @return true if an exception was thrown, false otherwise + */ + static bool ThrowNew(JNIEnv* env, const Status& s) { + assert(!s.ok()); + if (s.ok()) { + return false; + } + + // get the OutOfMemoryError class + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + std::cerr << "OutOfMemoryErrorJni::ThrowNew/class - Error: " + "unexpected exception!" + << std::endl; + return env->ExceptionCheck(); + } + + return JavaException::ThrowNew(env, s.ToString()); + } +}; + // The portal class for org.rocksdb.Status.Code class CodeJni : public JavaClass { public: @@ -2140,8 +2198,8 @@ class JniUtil { std::function op, - JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + JNIEnv* env, jbyteArray jkey, jint jkey_len, jbyteArray jvalue, + jint jvalue_len) { jbyte* key = env->GetByteArrayElements(jkey, nullptr); if (env->ExceptionCheck()) { // exception thrown: OutOfMemoryError @@ -2182,7 +2240,7 @@ class JniUtil { */ static std::unique_ptr k_op( std::function op, - JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len) { + JNIEnv* env, jbyteArray jkey, jint jkey_len) { jbyte* key = env->GetByteArrayElements(jkey, nullptr); if (env->ExceptionCheck()) { // exception thrown: OutOfMemoryError @@ -4730,48 +4788,6 @@ class CompactionPriorityJni { } }; -// The portal class for org.rocksdb.AccessHint -class AccessHintJni { - public: - // Returns the equivalent org.rocksdb.AccessHint for the provided - // C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum - static jbyte toJavaAccessHint( - const ROCKSDB_NAMESPACE::DBOptions::AccessHint& access_hint) { - switch (access_hint) { - case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE: - return 0x0; - case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL: - return 0x1; - case ROCKSDB_NAMESPACE::DBOptions::AccessHint::SEQUENTIAL: - return 0x2; - case ROCKSDB_NAMESPACE::DBOptions::AccessHint::WILLNEED: - return 0x3; - default: - // undefined/default - return 0x1; - } - } - - // Returns the equivalent C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum - // for the provided Java org.rocksdb.AccessHint - static ROCKSDB_NAMESPACE::DBOptions::AccessHint toCppAccessHint( - jbyte jaccess_hint) { - switch (jaccess_hint) { - case 0x0: - return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE; - case 0x1: - return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL; - case 0x2: - return ROCKSDB_NAMESPACE::DBOptions::AccessHint::SEQUENTIAL; - case 0x3: - return ROCKSDB_NAMESPACE::DBOptions::AccessHint::WILLNEED; - default: - // undefined/default - return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL; - } - } -}; - // The portal class for org.rocksdb.WALRecoveryMode class WALRecoveryModeJni { public: @@ -4839,352 +4855,430 @@ class TickerTypeJni { case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT: return 0x7; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS: - return 0x9; + return 0x8; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT: - return 0xA; + return 0x9; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD: - return 0xB; + return 0xA; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT: - return 0xC; + return 0xB; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS: - return 0xE; + return 0xC; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT: - return 0xF; + return 0xD; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD: - return 0x10; + return 0xE; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT: - return 0x11; + return 0xF; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_READ: - return 0x12; + return 0x10; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_WRITE: + return 0x11; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSION_DICT_MISS: + return 0x12; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSION_DICT_HIT: return 0x13; - case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL: + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSION_DICT_ADD: return 0x14; - case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT: + case ROCKSDB_NAMESPACE::Tickers:: + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT: return 0x15; - case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS: + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD_REDUNDANT: return 0x16; - case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT: + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD_REDUNDANT: return 0x17; - case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS: + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD_REDUNDANT: return 0x18; - case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT: + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD_REDUNDANT: return 0x19; - case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS: + case ROCKSDB_NAMESPACE::Tickers:: + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT: return 0x1A; - case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0: + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS: return 0x1B; - case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1: + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS: return 0x1C; - case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP: + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS: return 0x1D; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY: + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS: return 0x1E; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE: + case ROCKSDB_NAMESPACE::Tickers::COMPRESSED_SECONDARY_CACHE_DUMMY_HITS: return 0x1F; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL: + case ROCKSDB_NAMESPACE::Tickers::COMPRESSED_SECONDARY_CACHE_HITS: return 0x20; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER: + case ROCKSDB_NAMESPACE::Tickers::COMPRESSED_SECONDARY_CACHE_PROMOTIONS: return 0x21; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE: + case ROCKSDB_NAMESPACE::Tickers:: + COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS: return 0x22; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN: + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL: return 0x23; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ: + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE: return 0x24; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED: + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE: return 0x25; - case ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN: + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED: return 0x26; - case ROCKSDB_NAMESPACE::Tickers::BYTES_READ: + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL: return 0x27; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK: + case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_TRUE_POSITIVE: return 0x28; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT: + case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT: return 0x29; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV: + case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS: return 0x2A; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND: + case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT: return 0x2B; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND: + case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS: return 0x2C; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND: + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT: return 0x2D; - case ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ: + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS: return 0x2E; - case ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS: + case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0: + return 0x2F; + case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1: return 0x30; - case ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS: + case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP: return 0x31; - case ROCKSDB_NAMESPACE::Tickers::STALL_MICROS: + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY: + return 0x32; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE: + return 0x33; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL: + return 0x34; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER: return 0x35; - case ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS: + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE: return 0x36; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS: + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE: + return 0x37; + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED: + return 0x38; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN: return 0x39; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ: return 0x3A; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED: return 0x3B; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES: + case ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN: + return 0x3C; + case ROCKSDB_NAMESPACE::Tickers::BYTES_READ: return 0x3D; - case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK: return 0x3E; - case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT: return 0x3F; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV: return 0x40; - case ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND: return 0x41; - case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND: + return 0x42; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND: + return 0x43; + case ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ: + return 0x44; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP: + return 0x45; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION: return 0x46; - case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES: + case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED: return 0x47; - case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF: + case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED: return 0x48; - case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER: + case ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS: return 0x49; - case ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL: + case ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS: + return 0x4A; + case ROCKSDB_NAMESPACE::Tickers::STALL_MICROS: return 0x4B; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS: return 0x4C; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS: return 0x4D; - case ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ: return 0x4E; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ: return 0x4F; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND: return 0x50; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES: return 0x51; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS: + case ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS: return 0x52; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED: + case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED: return 0x53; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED: + case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES: return 0x54; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_NOT_COMPRESSED: + case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF: return 0x55; - case ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME: + case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER: return 0x56; - case ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME: + case ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL: return 0x57; - case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES: return 0x58; - case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES: return 0x59; - case ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES: + case ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES: return 0x5A; - case ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED: return 0x5B; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC: return 0x5C; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL: return 0x5D; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED: return 0x5E; - case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED: - // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). - return -0x01; - case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED: + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC: + return 0x5F; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL: return 0x60; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES: return 0x61; - case ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES: return 0x62; - case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES: return 0x63; - case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS: return 0x64; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED: return 0x65; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED: return 0x66; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET: + case ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSED_FROM: return 0x67; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET: + case ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSED_TO: return 0x68; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK: + case ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSION_BYPASSED: return 0x69; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT: + case ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSION_REJECTED: return 0x6A; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSION_BYPASSED: return 0x6B; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSION_REJECTED: return 0x6C; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ: + case ROCKSDB_NAMESPACE::Tickers::BYTES_DECOMPRESSED_FROM: return 0x6D; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN: + case ROCKSDB_NAMESPACE::Tickers::BYTES_DECOMPRESSED_TO: return 0x6E; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ: + case ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME: return 0x6F; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED: + case ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME: return 0x70; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL: + case ROCKSDB_NAMESPACE::Tickers::COMPACTION_CPU_TOTAL_TIME: return 0x71; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB: + case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT: return 0x72; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL: + case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS: return 0x73; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN: + case ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES: return 0x74; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ: + case ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES: return 0x75; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED: + case ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS: return 0x76; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT: return 0x77; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE: return 0x78; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET: return 0x79; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET: return 0x7A; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK: return 0x7B; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT: return 0x7C; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV: return 0x7D; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED: - return -0x02; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED: - return -0x05; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED: - return -0x06; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED: - return -0x07; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED: - return -0x08; - case ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD: - return -0x09; - case ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD: - return -0x0A; - case ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD: - return -0x0B; - case ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: - return -0x0C; - case ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN: - return -0x0D; - case ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH: - return -0x0E; - case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY: - return -0X0F; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN: + return 0x7E; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ: + return 0x7F; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN: + return -0x1; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ: + return -0x2; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED: + return -0x3; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL: + return -0x4; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB: + return -0x5; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL: + return -0x6; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN: + return -0x7; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ: + return -0x8; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED: + return -0x9; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT: + return -0xA; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE: + return -0xB; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT: + return -0xC; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE: + return -0xD; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES: + return -0xE; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES: + return -0xF; + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES: return -0x10; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED: return -0x11; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED: return -0x12; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED: return -0x13; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED: return -0x14; - case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED: return -0x15; - case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS: return -0x16; - case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT: return -0x17; - case ROCKSDB_NAMESPACE::Tickers:: - ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD: return -0x18; - case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES: return -0x19; - case ROCKSDB_NAMESPACE::Tickers:: - ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ: return -0x1A; - case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE: return -0x1B; - case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH: + case ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD: return -0x1C; - case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH: + case ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD: return -0x1D; - case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS: + case ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD: return -0x1E; - case ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: return -0x1F; - case ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN: return -0x20; - case ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES: + case ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH: return -0x21; - case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_FROM_TRASH_QUEUE: return -0x22; - case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES: + case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY: return -0x23; - case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT: return -0x24; - case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT: return -0x25; - case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT: return -0x26; - case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT: + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT: return -0x27; - case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT: + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT: return -0x28; - case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT: + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT: return -0x29; - case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH: return -0x2A; - case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT: + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH: return -0x2B; - case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES: + case ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES: return -0x2C; - case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES: return -0x2D; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT: + case ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES: return -0x2E; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS: + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES: return -0x2F; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT: + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES: return -0x30; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD: + case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES: return -0x31; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES: + case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES: return -0x32; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ: + case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES: return -0x33; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE: + case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT: return -0x34; - case ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS: + case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT: return -0x35; - case ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT: + case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT: return -0x36; - case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES: return -0x37; - case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT: return -0x38; - case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS: + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES: return -0x39; - case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS: + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT: return -0x3A; - case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_FILTERED: return -0x3B; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_FILTER_MATCH: return -0x3C; - case ROCKSDB_NAMESPACE::Tickers::READAHEAD_TRIMMED: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_DATA: return -0x3D; - case ROCKSDB_NAMESPACE::Tickers::FIFO_MAX_SIZE_COMPACTIONS: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER: return -0x3E; - case ROCKSDB_NAMESPACE::Tickers::FIFO_TTL_COMPACTIONS: + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH: return -0x3F; - case ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES: + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_SEEK_FILTERED: return -0x40; - case ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES_USEFUL: + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_SEEK_FILTER_MATCH: return -0x41; - case ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS: + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_SEEK_DATA: return -0x42; + case ROCKSDB_NAMESPACE::Tickers:: + NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER: + return -0x43; + case ROCKSDB_NAMESPACE::Tickers:: + NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH: + return -0x44; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT: + return -0x45; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT: + return -0x46; + case ROCKSDB_NAMESPACE::Tickers::MULTIGET_COROUTINE_COUNT: + return -0x47; + case ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS: + return -0x48; + case ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT: + return -0x49; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS: + return -0x4A; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: + return -0x4B; + case ROCKSDB_NAMESPACE::Tickers::TIMESTAMP_FILTER_TABLE_CHECKED: + return -0x4C; + case ROCKSDB_NAMESPACE::Tickers::TIMESTAMP_FILTER_TABLE_FILTERED: + return -0x4D; + case ROCKSDB_NAMESPACE::Tickers::READAHEAD_TRIMMED: + return -0x4E; + case ROCKSDB_NAMESPACE::Tickers::FIFO_MAX_SIZE_COMPACTIONS: + return -0x4F; + case ROCKSDB_NAMESPACE::Tickers::FIFO_TTL_COMPACTIONS: + return -0x50; + case ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES: + return -0x51; + case ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES_USEFUL: + return -0x52; + case ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS: + return -0x53; + case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT: + return -0x55; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: - // 0x5F was the max value in the initial copy of tickers to Java. - // Since these values are exposed directly to Java clients, we keep - // the value the same forever. + // -0x54 is the max value at this time. Since these values are exposed + // directly to Java clients, we'll keep the value the same till the next + // major release. // // TODO: This particular case seems confusing and unnecessary to pin the // value since it's meant to be the number of tickers, not an actual // ticker value. But we aren't yet in a position to fix it since the // number of tickers doesn't fit in the Java representation (jbyte). - return 0x5F; + return -0x54; default: // undefined/default return 0x0; @@ -5211,349 +5305,431 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD; case 0x7: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT; - case 0x9: + case 0x8: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS; - case 0xA: + case 0x9: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT; - case 0xB: + case 0xA: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD; - case 0xC: + case 0xB: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT; - case 0xE: + case 0xC: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS; - case 0xF: + case 0xD: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT; - case 0x10: + case 0xE: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD; - case 0x11: + case 0xF: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT; - case 0x12: + case 0x10: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_READ; - case 0x13: + case 0x11: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_WRITE; + case 0x12: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSION_DICT_MISS; + case 0x13: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSION_DICT_HIT; case 0x14: - return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL; + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSION_DICT_ADD; case 0x15: - return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT; + return ROCKSDB_NAMESPACE::Tickers:: + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT; case 0x16: - return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS; + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD_REDUNDANT; case 0x17: - return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT; + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD_REDUNDANT; case 0x18: - return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS; + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD_REDUNDANT; case 0x19: - return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT; + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD_REDUNDANT; case 0x1A: - return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS; + return ROCKSDB_NAMESPACE::Tickers:: + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT; case 0x1B: - return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0; + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS; case 0x1C: - return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1; + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS; case 0x1D: - return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP; + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS; case 0x1E: - return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY; + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS; case 0x1F: - return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE; + return ROCKSDB_NAMESPACE::Tickers:: + COMPRESSED_SECONDARY_CACHE_DUMMY_HITS; case 0x20: - return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL; + return ROCKSDB_NAMESPACE::Tickers::COMPRESSED_SECONDARY_CACHE_HITS; case 0x21: - return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER; + return ROCKSDB_NAMESPACE::Tickers:: + COMPRESSED_SECONDARY_CACHE_PROMOTIONS; case 0x22: - return ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE; + return ROCKSDB_NAMESPACE::Tickers:: + COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS; case 0x23: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN; + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL; case 0x24: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ; + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE; case 0x25: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED; + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE; case 0x26: - return ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN; + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED; case 0x27: - return ROCKSDB_NAMESPACE::Tickers::BYTES_READ; + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL; case 0x28: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK; + return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_TRUE_POSITIVE; case 0x29: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT; + return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT; case 0x2A: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV; + return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS; case 0x2B: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND; + return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT; case 0x2C: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND; + return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS; case 0x2D: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND; + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT; case 0x2E: - return ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ; + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS; + case 0x2F: + return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0; case 0x30: - return ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS; + return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1; case 0x31: - return ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS; + return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP; + case 0x32: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY; + case 0x33: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE; + case 0x34: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL; case 0x35: - return ROCKSDB_NAMESPACE::Tickers::STALL_MICROS; + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER; case 0x36: - return ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS; + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE; + case 0x37: + return ROCKSDB_NAMESPACE::Tickers:: + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE; + case 0x38: + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED; case 0x39: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN; case 0x3A: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ; case 0x3B: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED; + case 0x3C: + return ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN; case 0x3D: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES; + return ROCKSDB_NAMESPACE::Tickers::BYTES_READ; case 0x3E: - return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK; case 0x3F: - return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT; case 0x40: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV; case 0x41: - return ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND; + case 0x42: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND; + case 0x43: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND; + case 0x44: + return ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ; + case 0x45: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP; case 0x46: - return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION; case 0x47: - return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES; + return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED; case 0x48: - return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF; + return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED; case 0x49: - return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER; + return ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS; + case 0x4A: + return ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS; case 0x4B: - return ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL; + return ROCKSDB_NAMESPACE::Tickers::STALL_MICROS; case 0x4C: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS; case 0x4D: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS; case 0x4E: - return ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ; case 0x4F: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ; case 0x50: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND; case 0x51: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES; case 0x52: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS; + return ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS; case 0x53: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED; + return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED; case 0x54: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED; + return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES; case 0x55: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_NOT_COMPRESSED; + return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF; case 0x56: - return ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME; + return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER; case 0x57: - return ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME; + return ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL; case 0x58: - return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES; case 0x59: - return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES; case 0x5A: - return ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES; + return ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES; case 0x5B: - return ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED; case 0x5C: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC; case 0x5D: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL; case 0x5E: - return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND; - case -0x01: - // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). - return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED; + case 0x5F: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; case 0x60: - return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED; + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; case 0x61: - return ROCKSDB_NAMESPACE::Tickers:: - COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES; case 0x62: - return ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES; case 0x63: - return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES; case 0x64: - return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS; case 0x65: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED; case 0x66: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED; case 0x67: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET; + return ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSED_FROM; case 0x68: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET; + return ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSED_TO; case 0x69: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK; + return ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSION_BYPASSED; case 0x6A: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT; + return ROCKSDB_NAMESPACE::Tickers::BYTES_COMPRESSION_REJECTED; case 0x6B: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSION_BYPASSED; case 0x6C: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSION_REJECTED; case 0x6D: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ; + return ROCKSDB_NAMESPACE::Tickers::BYTES_DECOMPRESSED_FROM; case 0x6E: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN; + return ROCKSDB_NAMESPACE::Tickers::BYTES_DECOMPRESSED_TO; case 0x6F: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ; + return ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME; case 0x70: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED; + return ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME; case 0x71: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL; + return ROCKSDB_NAMESPACE::Tickers::COMPACTION_CPU_TOTAL_TIME; case 0x72: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB; + return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT; case 0x73: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL; + return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS; case 0x74: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN; + return ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES; case 0x75: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ; + return ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES; case 0x76: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED; + return ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS; case 0x77: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT; case 0x78: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE; case 0x79: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET; case 0x7A: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET; case 0x7B: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK; case 0x7C: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT; case 0x7D: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES; - case -0x02: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED; - case -0x05: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED; - case -0x06: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED; - case -0x07: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED; - case -0x08: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED; - case -0x09: - return ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD; - case -0x0A: - return ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD; - case -0x0B: - return ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; - case -0x0C: - return ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; - case -0x0D: - return ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN; - case -0x0E: - return ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH; - case -0x0F: - return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV; + case 0x7E: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN; + case 0x7F: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ; + case -0x1: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN; + case -0x2: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ; + case -0x3: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED; + case -0x4: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL; + case -0x5: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB; + case -0x6: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL; + case -0x7: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN; + case -0x8: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ; + case -0x9: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED; + case -0xA: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT; + case -0xB: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE; + case -0xC: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT; + case -0xD: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE; + case -0xE: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES; + case -0xF: + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES; case -0x10: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES; case -0x11: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED; case -0x12: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED; case -0x13: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED; case -0x14: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED; case -0x15: - return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED; case -0x16: - return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS; case -0x17: - return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT; case -0x18: - return ROCKSDB_NAMESPACE::Tickers:: - ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD; case -0x19: - return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES; case -0x1A: - return ROCKSDB_NAMESPACE::Tickers:: - ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ; case -0x1B: - return ROCKSDB_NAMESPACE::Tickers:: - ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE; case -0x1C: - return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; + return ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD; case -0x1D: - return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + return ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD; case -0x1E: - return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS; + return ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; case -0x1F: - return ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; case -0x20: - return ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN; case -0x21: - return ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES; + return ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH; case -0x22: - return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_FROM_TRASH_QUEUE; case -0x23: - return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES; + return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY; case -0x24: - return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; case -0x25: - return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; case -0x26: - return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; case -0x27: - return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT; + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; case -0x28: - return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT; + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; case -0x29: - return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT; + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; case -0x2A: - return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; case -0x2B: - return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT; + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; case -0x2C: - return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES; + return ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES; case -0x2D: - return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES; case -0x2E: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT; + return ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES; case -0x2F: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS; + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES; case -0x30: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT; + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES; case -0x31: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD; + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES; case -0x32: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES; + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES; case -0x33: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ; + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES; case -0x34: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE; + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT; case -0x35: - return ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS; + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT; case -0x36: - return ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT; + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT; case -0x37: - return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS; + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES; case -0x38: - return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS; + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT; case -0x39: - return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS; + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES; case -0x3A: - return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT; case -0x3B: - return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_FILTERED; case -0x3C: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT; + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_FILTER_MATCH; case -0x3D: - return ROCKSDB_NAMESPACE::Tickers::READAHEAD_TRIMMED; + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_SEEK_DATA; case -0x3E: - return ROCKSDB_NAMESPACE::Tickers::FIFO_MAX_SIZE_COMPACTIONS; + return ROCKSDB_NAMESPACE::Tickers:: + LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER; case -0x3F: - return ROCKSDB_NAMESPACE::Tickers::FIFO_TTL_COMPACTIONS; + return ROCKSDB_NAMESPACE::Tickers:: + LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH; case -0x40: - return ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES; + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_SEEK_FILTERED; case -0x41: - return ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES_USEFUL; + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_SEEK_FILTER_MATCH; case -0x42: + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_SEEK_DATA; + case -0x43: + return ROCKSDB_NAMESPACE::Tickers:: + NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER; + case -0x44: + return ROCKSDB_NAMESPACE::Tickers:: + NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH; + case -0x45: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT; + case -0x46: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT; + case -0x47: + return ROCKSDB_NAMESPACE::Tickers::MULTIGET_COROUTINE_COUNT; + case -0x48: + return ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS; + case -0x49: + return ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT; + case -0x4A: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; + case -0x4B: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; + case -0x4C: + return ROCKSDB_NAMESPACE::Tickers::TIMESTAMP_FILTER_TABLE_CHECKED; + case -0x4D: + return ROCKSDB_NAMESPACE::Tickers::TIMESTAMP_FILTER_TABLE_FILTERED; + case -0x4E: + return ROCKSDB_NAMESPACE::Tickers::READAHEAD_TRIMMED; + case -0x4F: + return ROCKSDB_NAMESPACE::Tickers::FIFO_MAX_SIZE_COMPACTIONS; + case -0x50: + return ROCKSDB_NAMESPACE::Tickers::FIFO_TTL_COMPACTIONS; + case -0x51: + return ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES; + case -0x52: + return ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES_USEFUL; + case -0x53: return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS; - case 0x5F: - // 0x5F was the max value in the initial copy of tickers to Java. - // Since these values are exposed directly to Java clients, we keep - // the value the same forever. + case -0x55: + return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT; + case -0x54: + // -0x54 is the max value at this time. Since these values are exposed + // directly to Java clients, we'll keep the value the same till the next + // major release. // // TODO: This particular case seems confusing and unnecessary to pin the // value since it's meant to be the number of tickers, not an actual @@ -5582,124 +5758,131 @@ class HistogramTypeJni { return 0x1; case ROCKSDB_NAMESPACE::Histograms::COMPACTION_TIME: return 0x2; - case ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME: + case ROCKSDB_NAMESPACE::Histograms::COMPACTION_CPU_TIME: return 0x3; - case ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS: + case ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME: return 0x4; - case ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS: + case ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS: return 0x5; - case ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS: + case ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS: return 0x6; - case ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS: + case ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS: return 0x7; - case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS: + case ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS: return 0x8; - case ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET: + case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS: return 0x9; - case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS: + case ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET: return 0xA; - case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS: + case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS: return 0xB; - case ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS: + case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS: return 0xC; + case ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS: + return 0xD; case ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION: - return 0x12; + return 0xE; case ROCKSDB_NAMESPACE::Histograms::DB_SEEK: - return 0x13; + return 0xF; case ROCKSDB_NAMESPACE::Histograms::WRITE_STALL: - return 0x14; + return 0x10; case ROCKSDB_NAMESPACE::Histograms::SST_READ_MICROS: + return 0x11; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS: + return 0x12; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS: + return 0x13; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS: + return 0x14; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_GET_MICROS: return 0x15; - case ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED: + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_MULTIGET_MICROS: return 0x16; - case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ: + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_ITERATOR_MICROS: return 0x17; - case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE: + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_VERIFY_DB_CHECKSUM_MICROS: return 0x18; - case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET: + case ROCKSDB_NAMESPACE::Histograms:: + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS: return 0x19; - case ROCKSDB_NAMESPACE::Histograms::BYTES_COMPRESSED: + case ROCKSDB_NAMESPACE::Histograms::SST_WRITE_MICROS: return 0x1A; - case ROCKSDB_NAMESPACE::Histograms::BYTES_DECOMPRESSED: + case ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_FLUSH_MICROS: return 0x1B; - case ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS: + case ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_COMPACTION_MICROS: return 0x1C; - case ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS: + case ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_DB_OPEN_MICROS: return 0x1D; - case ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS: + case ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED: return 0x1E; - // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor - // version compatibility. - case ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME: + case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ: + return 0x1F; + case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE: return 0x20; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE: + case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET: return 0x21; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE: + case ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS: return 0x22; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS: + case ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS: return 0x23; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS: + case ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS: return 0x24; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE: return 0x25; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE: return 0x26; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS: return 0x27; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS: return 0x28; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS: return 0x29; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS: return 0x2A; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS: return 0x2B; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS: + return 0x2C; + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS: return 0x2D; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS: return 0x2E; - case ROCKSDB_NAMESPACE::Histograms:: - NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS: return 0x2F; - case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS: + return 0x30; + case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS: return 0x31; - case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor + // version compatibility. + case ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME: return 0x32; - case ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES: + case ROCKSDB_NAMESPACE::Histograms::SST_BATCH_SIZE: return 0x33; - case ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS: + case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE: return 0x34; - case ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED: + case ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL: return 0x35; - case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE: + case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: return 0x36; - case NUM_LEVEL_READ_PER_MULTIGET: + case ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET: return 0x37; - case ASYNC_PREFETCH_ABORT_MICROS: + case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: return 0x38; - case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: + case ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES: return 0x39; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS: + case ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS: return 0x3A; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS: + case ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED: return 0x3B; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS: + case ASYNC_PREFETCH_ABORT_MICROS: return 0x3C; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_GET_MICROS: + case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: return 0x3D; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_MULTIGET_MICROS: - return 0x3E; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_ITERATOR_MICROS: - return 0x3F; - case ROCKSDB_NAMESPACE::Histograms::FILE_READ_VERIFY_DB_CHECKSUM_MICROS: - return 0x40; - case ROCKSDB_NAMESPACE::Histograms:: - FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS: - return 0x41; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: - // 0x1F for backwards compatibility on current minor version. - return 0x1F; - + // 0x3D for backwards compatibility on current minor version. + return 0x3E; default: // undefined/default return 0x0; @@ -5717,123 +5900,131 @@ class HistogramTypeJni { case 0x2: return ROCKSDB_NAMESPACE::Histograms::COMPACTION_TIME; case 0x3: - return ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME; + return ROCKSDB_NAMESPACE::Histograms::COMPACTION_CPU_TIME; case 0x4: - return ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS; + return ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME; case 0x5: - return ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS; + return ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS; case 0x6: - return ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS; + return ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS; case 0x7: - return ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS; + return ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS; case 0x8: - return ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS; + return ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS; case 0x9: - return ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET; + return ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS; case 0xA: - return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS; + return ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET; case 0xB: - return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS; + return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS; case 0xC: + return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS; + case 0xD: return ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS; - case 0x12: + case 0xE: return ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION; - case 0x13: + case 0xF: return ROCKSDB_NAMESPACE::Histograms::DB_SEEK; - case 0x14: + case 0x10: return ROCKSDB_NAMESPACE::Histograms::WRITE_STALL; - case 0x15: + case 0x11: return ROCKSDB_NAMESPACE::Histograms::SST_READ_MICROS; + case 0x12: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS; + case 0x13: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS; + case 0x14: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS; + case 0x15: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_GET_MICROS; case 0x16: - return ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED; + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_MULTIGET_MICROS; case 0x17: - return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ; + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_ITERATOR_MICROS; case 0x18: - return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE; + return ROCKSDB_NAMESPACE::Histograms:: + FILE_READ_VERIFY_DB_CHECKSUM_MICROS; case 0x19: - return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET; + return ROCKSDB_NAMESPACE::Histograms:: + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS; case 0x1A: - return ROCKSDB_NAMESPACE::Histograms::BYTES_COMPRESSED; + return ROCKSDB_NAMESPACE::Histograms::SST_WRITE_MICROS; case 0x1B: - return ROCKSDB_NAMESPACE::Histograms::BYTES_DECOMPRESSED; + return ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_FLUSH_MICROS; case 0x1C: - return ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS; + return ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_COMPACTION_MICROS; case 0x1D: - return ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS; + return ROCKSDB_NAMESPACE::Histograms::FILE_WRITE_DB_OPEN_MICROS; case 0x1E: - return ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS; - // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor - // version compatibility. + return ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED; + case 0x1F: + return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ; case 0x20: - return ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME; + return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE; case 0x21: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE; + return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET; case 0x22: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE; + return ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS; case 0x23: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS; + return ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS; case 0x24: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS; + return ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS; case 0x25: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE; case 0x26: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE; case 0x27: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS; case 0x28: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS; case 0x29: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS; case 0x2A: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS; case 0x2B: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS; + case 0x2C: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS; case 0x2D: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS; case 0x2E: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS; case 0x2F: - return ROCKSDB_NAMESPACE::Histograms:: - NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS; + case 0x30: + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; case 0x31: - return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; + return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor + // version compatibility. case 0x32: - return ROCKSDB_NAMESPACE::Histograms:: - ERROR_HANDLER_AUTORESUME_RETRY_COUNT; + return ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME; case 0x33: - return ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES; + return ROCKSDB_NAMESPACE::Histograms::SST_BATCH_SIZE; case 0x34: - return ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS; + return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE; case 0x35: - return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED; + return ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; case 0x36: - return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE; + return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; case 0x37: return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET; case 0x38: - return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS; - case 0x39: return ROCKSDB_NAMESPACE::Histograms:: - TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + ERROR_HANDLER_AUTORESUME_RETRY_COUNT; + case 0x39: + return ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES; case 0x3A: - return ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS; + return ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS; case 0x3B: - return ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS; + return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED; case 0x3C: - return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS; + return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS; case 0x3D: - return ROCKSDB_NAMESPACE::Histograms::FILE_READ_GET_MICROS; - case 0x3E: - return ROCKSDB_NAMESPACE::Histograms::FILE_READ_MULTIGET_MICROS; - case 0x3F: - return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_ITERATOR_MICROS; - case 0x40: - return ROCKSDB_NAMESPACE::Histograms:: - FILE_READ_VERIFY_DB_CHECKSUM_MICROS; - case 0x41: return ROCKSDB_NAMESPACE::Histograms:: - FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS; - case 0x1F: + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + case 0x3E: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -7427,7 +7618,7 @@ class LiveFileMetaDataJni : public JavaClass { jmethodID mid = env->GetMethodID( jclazz, "", - "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ[B)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -7478,6 +7669,18 @@ class LiveFileMetaDataJni : public JavaClass { return nullptr; } + jbyteArray jfile_checksum = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, live_file_meta_data->file_checksum); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + return nullptr; + } + jobject jlive_file_meta_data = env->NewObject( jclazz, mid, jcolumn_family_name, static_cast(live_file_meta_data->level), jfile_name, jpath, @@ -7488,7 +7691,7 @@ class LiveFileMetaDataJni : public JavaClass { static_cast(live_file_meta_data->num_reads_sampled), static_cast(live_file_meta_data->being_compacted), static_cast(live_file_meta_data->num_entries), - static_cast(live_file_meta_data->num_deletions)); + static_cast(live_file_meta_data->num_deletions), jfile_checksum); if (env->ExceptionCheck()) { env->DeleteLocalRef(jcolumn_family_name); @@ -7496,6 +7699,7 @@ class LiveFileMetaDataJni : public JavaClass { env->DeleteLocalRef(jpath); env->DeleteLocalRef(jsmallest_key); env->DeleteLocalRef(jlargest_key); + env->DeleteLocalRef(jfile_checksum); return nullptr; } @@ -7505,6 +7709,7 @@ class LiveFileMetaDataJni : public JavaClass { env->DeleteLocalRef(jpath); env->DeleteLocalRef(jsmallest_key); env->DeleteLocalRef(jlargest_key); + env->DeleteLocalRef(jfile_checksum); return jlive_file_meta_data; } @@ -7535,7 +7740,8 @@ class SstFileMetaDataJni : public JavaClass { } jmethodID mid = env->GetMethodID( - jclazz, "", "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + jclazz, "", + "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ[B)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -7575,6 +7781,17 @@ class SstFileMetaDataJni : public JavaClass { return nullptr; } + jbyteArray jfile_checksum = ROCKSDB_NAMESPACE::JniUtil::copyBytes( + env, sst_file_meta_data->file_checksum); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + return nullptr; + } + jobject jsst_file_meta_data = env->NewObject( jclazz, mid, jfile_name, jpath, static_cast(sst_file_meta_data->size), @@ -7583,13 +7800,14 @@ class SstFileMetaDataJni : public JavaClass { jlargest_key, static_cast(sst_file_meta_data->num_reads_sampled), static_cast(sst_file_meta_data->being_compacted), static_cast(sst_file_meta_data->num_entries), - static_cast(sst_file_meta_data->num_deletions)); + static_cast(sst_file_meta_data->num_deletions), jfile_checksum); if (env->ExceptionCheck()) { env->DeleteLocalRef(jfile_name); env->DeleteLocalRef(jpath); env->DeleteLocalRef(jsmallest_key); env->DeleteLocalRef(jlargest_key); + env->DeleteLocalRef(jfile_checksum); return nullptr; } @@ -7598,6 +7816,7 @@ class SstFileMetaDataJni : public JavaClass { env->DeleteLocalRef(jpath); env->DeleteLocalRef(jsmallest_key); env->DeleteLocalRef(jlargest_key); + env->DeleteLocalRef(jfile_checksum); return jsst_file_meta_data; } diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc index 7a17f367e53..b22fa5931d8 100644 --- a/java/rocksjni/ratelimiterjni.cc +++ b/java/rocksjni/ratelimiterjni.cc @@ -36,9 +36,9 @@ jlong Java_org_rocksdb_RateLimiter_newRateLimiterHandle( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RateLimiter_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_RateLimiter_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* handle = reinterpret_cast*>( jhandle); @@ -51,7 +51,7 @@ void Java_org_rocksdb_RateLimiter_disposeInternal(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_RateLimiter_setBytesPerSecond(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle, jlong jbytes_per_second) { reinterpret_cast*>(handle) @@ -65,7 +65,7 @@ void Java_org_rocksdb_RateLimiter_setBytesPerSecond(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_RateLimiter_getBytesPerSecond(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { return reinterpret_cast*>( handle) @@ -78,7 +78,7 @@ jlong Java_org_rocksdb_RateLimiter_getBytesPerSecond(JNIEnv* /*env*/, * Method: request * Signature: (JJ)V */ -void Java_org_rocksdb_RateLimiter_request(JNIEnv* /*env*/, jobject /*jobj*/, +void Java_org_rocksdb_RateLimiter_request(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jlong jbytes) { reinterpret_cast*>(handle) ->get() @@ -91,7 +91,7 @@ void Java_org_rocksdb_RateLimiter_request(JNIEnv* /*env*/, jobject /*jobj*/, * Signature: (J)J */ jlong Java_org_rocksdb_RateLimiter_getSingleBurstBytes(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { return reinterpret_cast*>( handle) @@ -105,7 +105,7 @@ jlong Java_org_rocksdb_RateLimiter_getSingleBurstBytes(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_RateLimiter_getTotalBytesThrough(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { return reinterpret_cast*>( handle) @@ -119,7 +119,7 @@ jlong Java_org_rocksdb_RateLimiter_getTotalBytesThrough(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_RateLimiter_getTotalRequests(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { return reinterpret_cast*>( handle) diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index aadc861286b..6f03d880405 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -33,9 +33,9 @@ jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RestoreOptions_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_RestoreOptions_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jhandle) { auto* ropt = reinterpret_cast(jhandle); assert(ropt); delete ropt; diff --git a/java/rocksjni/rocks_callback_object.cc b/java/rocksjni/rocks_callback_object.cc index 35513e1519b..7b2dbf5ec8f 100644 --- a/java/rocksjni/rocks_callback_object.cc +++ b/java/rocksjni/rocks_callback_object.cc @@ -17,7 +17,7 @@ * Signature: (J)V */ void Java_org_rocksdb_RocksCallbackObject_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { // TODO(AR) is deleting from the super class JniCallback OK, or must we delete // the subclass? Example hierarchies: diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 66eb2488b1b..abd2d58c8fd 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -26,6 +26,8 @@ #include "rocksdb/types.h" #include "rocksdb/version.h" #include "rocksjni/cplusplus_to_java_convert.h" +#include "rocksjni/jni_multiget_helpers.h" +#include "rocksjni/kv_helper.h" #include "rocksjni/portal.h" #ifdef min @@ -290,7 +292,8 @@ Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_disposeInternal(JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_RocksDB_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* db = reinterpret_cast(jhandle); assert(db != nullptr); delete db; @@ -341,7 +344,7 @@ jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies(JNIEnv* env, jclass, * Method: createColumnFamily * Signature: (J[BIJ)J */ -jlong Java_org_rocksdb_RocksDB_createColumnFamily(JNIEnv* env, jobject, +jlong Java_org_rocksdb_RocksDB_createColumnFamily(JNIEnv* env, jclass, jlong jhandle, jbyteArray jcf_name, jint jcf_name_len, @@ -378,7 +381,7 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily(JNIEnv* env, jobject, * Signature: (JJ[[B)[J */ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__JJ_3_3B( - JNIEnv* env, jobject, jlong jhandle, jlong jcf_options_handle, + JNIEnv* env, jclass, jlong jhandle, jlong jcf_options_handle, jobjectArray jcf_names) { auto* db = reinterpret_cast(jhandle); auto* cf_options = reinterpret_cast( @@ -419,7 +422,7 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__JJ_3_3B( * Signature: (J[J[[B)[J */ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( - JNIEnv* env, jobject, jlong jhandle, jlongArray jcf_options_handles, + JNIEnv* env, jclass, jlong jhandle, jlongArray jcf_options_handles, jobjectArray jcf_names) { auto* db = reinterpret_cast(jhandle); const jsize jlen = env->GetArrayLength(jcf_options_handles); @@ -496,7 +499,7 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( * Signature: (J[BIJJ[J)J */ jlong Java_org_rocksdb_RocksDB_createColumnFamilyWithImport( - JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jcf_name, + JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jcf_name, jint jcf_name_len, jlong j_cf_options, jlong j_cf_import_options, jlongArray j_metadata_handle_array) { auto* db = reinterpret_cast(jdb_handle); @@ -552,7 +555,7 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamilyWithImport( * Method: dropColumnFamily * Signature: (JJ)V; */ -void Java_org_rocksdb_RocksDB_dropColumnFamily(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_dropColumnFamily(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db_handle = reinterpret_cast(jdb_handle); @@ -570,7 +573,7 @@ void Java_org_rocksdb_RocksDB_dropColumnFamily(JNIEnv* env, jobject, * Signature: (J[J)V */ void Java_org_rocksdb_RocksDB_dropColumnFamilies( - JNIEnv* env, jobject, jlong jdb_handle, jlongArray jcolumn_family_handles) { + JNIEnv* env, jclass, jlong jdb_handle, jlongArray jcolumn_family_handles) { auto* db_handle = reinterpret_cast(jdb_handle); std::vector cf_handles; @@ -600,61 +603,12 @@ void Java_org_rocksdb_RocksDB_dropColumnFamilies( ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::DB::Put -/** - * @return true if the put succeeded, false if a Java Exception was thrown - */ -bool rocksdb_put_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::WriteOptions& write_options, - ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] key; - return false; - } - - jbyte* value = new jbyte[jval_len]; - env->GetByteArrayRegion(jval, jval_off, jval_len, value); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] value; - delete[] key; - return false; - } - - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), - jval_len); - - ROCKSDB_NAMESPACE::Status s; - if (cf_handle != nullptr) { - s = db->Put(write_options, cf_handle, key_slice, value_slice); - } else { - // backwards compatibility - s = db->Put(write_options, key_slice, value_slice); - } - - // cleanup - delete[] value; - delete[] key; - - if (s.ok()) { - return true; - } else { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - return false; - } -} - /* * Class: org_rocksdb_RocksDB * Method: put * Signature: (J[BII[BII)V */ -void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, @@ -662,8 +616,14 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject, auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); - rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Put(default_write_options, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -671,7 +631,7 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject, * Method: put * Signature: (J[BII[BIIJ)V */ -void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, @@ -682,13 +642,21 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::WriteOptions(); auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - rocksdb_put_helper(env, db, default_write_options, cf_handle, jkey, - jkey_off, jkey_len, jval, jval_off, jval_len); - } else { + if (cf_handle == nullptr) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( "Invalid ColumnFamilyHandle.")); + return; + } + + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + db->Put(default_write_options, cf_handle, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; } } @@ -697,7 +665,7 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jobject, * Method: put * Signature: (JJ[BII[BII)V */ -void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_off, @@ -706,8 +674,15 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject, auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); - rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_off, jkey_len, - jval, jval_off, jval_len); + + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Put(*write_options, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -716,7 +691,7 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject, * Signature: (JJ[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -724,13 +699,19 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ( reinterpret_cast(jwrite_options_handle); auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - rocksdb_put_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); - } else { + if (cf_handle == nullptr) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( "Invalid ColumnFamilyHandle.")); + return; + } + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Put(*write_options, cf_handle, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; } } @@ -740,7 +721,7 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ( * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V */ void Java_org_rocksdb_RocksDB_putDirect( - JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jlong jwrite_options_handle, + JNIEnv* env, jclass /*jdb*/, jlong jdb_handle, jlong jwrite_options_handle, jobject jkey, jint jkey_off, jint jkey_len, jobject jval, jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -809,7 +790,7 @@ bool rocksdb_delete_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, * Method: delete * Signature: (J[BII)V */ -void Java_org_rocksdb_RocksDB_delete__J_3BII(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_delete__J_3BII(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len) { auto* db = reinterpret_cast(jdb_handle); @@ -824,7 +805,7 @@ void Java_org_rocksdb_RocksDB_delete__J_3BII(JNIEnv* env, jobject, * Method: delete * Signature: (J[BIIJ)V */ -void Java_org_rocksdb_RocksDB_delete__J_3BIIJ(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_delete__J_3BIIJ(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { @@ -848,7 +829,7 @@ void Java_org_rocksdb_RocksDB_delete__J_3BIIJ(JNIEnv* env, jobject, * Method: delete * Signature: (JJ[BII)V */ -void Java_org_rocksdb_RocksDB_delete__JJ_3BII(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_delete__JJ_3BII(JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options, jbyteArray jkey, jint jkey_off, @@ -866,7 +847,7 @@ void Java_org_rocksdb_RocksDB_delete__JJ_3BII(JNIEnv* env, jobject, * Signature: (JJ[BIIJ)V */ void Java_org_rocksdb_RocksDB_delete__JJ_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options, jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = @@ -926,7 +907,7 @@ bool rocksdb_single_delete_helper( * Method: singleDelete * Signature: (J[BI)V */ -void Java_org_rocksdb_RocksDB_singleDelete__J_3BI(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_singleDelete__J_3BI(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_len) { @@ -942,7 +923,7 @@ void Java_org_rocksdb_RocksDB_singleDelete__J_3BI(JNIEnv* env, jobject, * Method: singleDelete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_len, @@ -967,7 +948,7 @@ void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ(JNIEnv* env, jobject, * Method: singleDelete * Signature: (JJ[BIJ)V */ -void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI(JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options, jbyteArray jkey, @@ -985,7 +966,7 @@ void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI(JNIEnv* env, jobject, * Signature: (JJ[BIJ)V */ void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = @@ -1062,7 +1043,7 @@ bool rocksdb_delete_range_helper( * Signature: (J[BII[BII)V */ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jbegin_key, + JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { auto* db = reinterpret_cast(jdb_handle); @@ -1073,100 +1054,13 @@ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BII( jend_key, jend_key_off, jend_key_len); } -jint rocksdb_get_helper_direct( - JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& read_options, - ROCKSDB_NAMESPACE::ColumnFamilyHandle* column_family_handle, jobject jkey, - jint jkey_off, jint jkey_len, jobject jval, jint jval_off, jint jval_len, - bool* has_exception) { - static const int kNotFound = -1; - static const int kStatusError = -2; - static const int kArgumentError = -3; - - char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); - if (key == nullptr) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, - "Invalid key argument (argument is not a valid direct ByteBuffer)"); - *has_exception = true; - return kArgumentError; - } - if (env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, - "Invalid key argument. Capacity is less than requested region (offset " - "+ length)."); - *has_exception = true; - return kArgumentError; - } - - char* value = reinterpret_cast(env->GetDirectBufferAddress(jval)); - if (value == nullptr) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, - "Invalid value argument (argument is not a valid direct ByteBuffer)"); - *has_exception = true; - return kArgumentError; - } - - if (env->GetDirectBufferCapacity(jval) < (jval_off + jval_len)) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, - "Invalid value argument. Capacity is less than requested region " - "(offset + length)."); - *has_exception = true; - return kArgumentError; - } - - key += jkey_off; - value += jval_off; - - ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); - - ROCKSDB_NAMESPACE::PinnableSlice pinnable_value; - ROCKSDB_NAMESPACE::Status s; - if (column_family_handle != nullptr) { - s = db->Get(read_options, column_family_handle, key_slice, &pinnable_value); - } else { - // backwards compatibility - s = db->Get(read_options, db->DefaultColumnFamily(), key_slice, - &pinnable_value); - } - - if (s.IsNotFound()) { - *has_exception = false; - return kNotFound; - } else if (!s.ok()) { - *has_exception = true; - // Here since we are throwing a Java exception from c++ side. - // As a result, c++ does not know calling this function will in fact - // throwing an exception. As a result, the execution flow will - // not stop here, and codes after this throw will still be - // executed. - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - - // Return a dummy const value to avoid compilation error, although - // java side might not have a chance to get the return value :) - return kStatusError; - } - - const jint pinnable_value_len = static_cast(pinnable_value.size()); - const jint length = std::min(jval_len, pinnable_value_len); - - memcpy(value, pinnable_value.data(), length); - pinnable_value.Reset(); - - *has_exception = false; - return pinnable_value_len; -} - /* * Class: org_rocksdb_RocksDB * Method: deleteRange * Signature: (J[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jbegin_key, + JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_off, jint jend_key_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -1191,7 +1085,7 @@ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ( * Signature: (JJ[BII[BII)V */ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options, jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { auto* db = reinterpret_cast(jdb_handle); @@ -1208,7 +1102,7 @@ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BII( * Signature: (JJ[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options, jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_off, jint jend_key_len, jlong jcf_handle) { @@ -1234,7 +1128,7 @@ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ( * Signature: (JJ[BII[BII)V */ void Java_org_rocksdb_RocksDB_clipColumnFamily( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { auto* db = reinterpret_cast(jdb_handle); @@ -1288,80 +1182,50 @@ void Java_org_rocksdb_RocksDB_clipColumnFamily( * Method: getDirect * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)I */ -jint Java_org_rocksdb_RocksDB_getDirect(JNIEnv* env, jobject /*jdb*/, +jint Java_org_rocksdb_RocksDB_getDirect(JNIEnv* env, jclass /*jdb*/, jlong jdb_handle, jlong jropt_handle, jobject jkey, jint jkey_off, jint jkey_len, jobject jval, jint jval_off, jint jval_len, jlong jcf_handle) { - auto* db_handle = reinterpret_cast(jdb_handle); + auto* db = reinterpret_cast(jdb_handle); auto* ro_opt = reinterpret_cast(jropt_handle); auto* cf_handle = reinterpret_cast(jcf_handle); - bool has_exception = false; - return rocksdb_get_helper_direct( - env, db_handle, - ro_opt == nullptr ? ROCKSDB_NAMESPACE::ReadOptions() : *ro_opt, cf_handle, - jkey, jkey_off, jkey_len, jval, jval_off, jval_len, &has_exception); -} - -////////////////////////////////////////////////////////////////////////////// -// ROCKSDB_NAMESPACE::DB::Merge - -/** - * @return true if the merge succeeded, false if a Java Exception was thrown - */ -bool rocksdb_merge_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::WriteOptions& write_options, - ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] key; - return false; - } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - - jbyte* value = new jbyte[jval_len]; - env->GetByteArrayRegion(jval, jval_off, jval_len, value); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] value; - delete[] key; - return false; - } - ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), - jval_len); - - ROCKSDB_NAMESPACE::Status s; - if (cf_handle != nullptr) { - s = db->Merge(write_options, cf_handle, key_slice, value_slice); - } else { - s = db->Merge(write_options, key_slice, value_slice); - } - // cleanup - delete[] value; - delete[] key; + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JDirectBufferPinnableSlice value(env, jval, jval_off, + jval_len); + ROCKSDB_NAMESPACE::Status s; + if (cf_handle != nullptr) { + s = db->Get( + ro_opt == nullptr ? ROCKSDB_NAMESPACE::ReadOptions() : *ro_opt, + cf_handle, key.slice(), &value.pinnable_slice()); + } else { + // backwards compatibility + s = db->Get( + ro_opt == nullptr ? ROCKSDB_NAMESPACE::ReadOptions() : *ro_opt, + db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()); + } - if (s.ok()) { - return true; + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); + return value.Fetch(); + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); } - - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - return false; } +////////////////////////////////////////////////////////////////////////////// +// ROCKSDB_NAMESPACE::DB::Merge + /* * Class: org_rocksdb_RocksDB * Method: merge * Signature: (J[BII[BII)V */ -void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, @@ -1369,8 +1233,14 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jobject, auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); - rocksdb_merge_helper(env, db, default_write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Merge(default_write_options, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -1379,7 +1249,7 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jobject, * Signature: (J[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jkey, jint jkey_off, + JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -1388,8 +1258,15 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - rocksdb_merge_helper(env, db, default_write_options, cf_handle, jkey, - jkey_off, jkey_len, jval, jval_off, jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Merge(default_write_options, cf_handle, key.slice(), + value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1403,14 +1280,20 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( * Signature: (JJ[BII[BII)V */ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); - rocksdb_merge_helper(env, db, *write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Merge(*write_options, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -1419,7 +1302,7 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII( * Signature: (JJ[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -1428,8 +1311,15 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ( auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - rocksdb_merge_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + db->Merge(*write_options, cf_handle, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1437,16 +1327,37 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ( } } -jlong rocksdb_iterator_helper( - ROCKSDB_NAMESPACE::DB* db, ROCKSDB_NAMESPACE::ReadOptions read_options, - ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle) { - ROCKSDB_NAMESPACE::Iterator* iterator = nullptr; - if (cf_handle != nullptr) { - iterator = db->NewIterator(read_options, cf_handle); - } else { - iterator = db->NewIterator(read_options); - } - return GET_CPLUSPLUS_POINTER(iterator); +/* + * Class: org_rocksdb_RocksDB + * Method: mergeDirect + * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V + */ +void Java_org_rocksdb_RocksDB_mergeDirect( + JNIEnv* env, jclass /*jdb*/, jlong jdb_handle, jlong jwrite_options_handle, + jobject jkey, jint jkey_off, jint jkey_len, jobject jval, jint jval_off, + jint jval_len, jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* cf_handle = + reinterpret_cast(jcf_handle); + + auto merge = [&env, &db, &cf_handle, &write_options]( + ROCKSDB_NAMESPACE::Slice& key, + ROCKSDB_NAMESPACE::Slice& value) { + ROCKSDB_NAMESPACE::Status s; + if (cf_handle == nullptr) { + s = db->Merge(*write_options, key, value); + } else { + s = db->Merge(*write_options, cf_handle, key, value); + } + if (s.ok()) { + return; + } + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + }; + ROCKSDB_NAMESPACE::JniUtil::kv_op_direct(merge, env, jkey, jkey_off, jkey_len, + jval, jval_off, jval_len); } /* @@ -1454,7 +1365,7 @@ jlong rocksdb_iterator_helper( * Method: deleteDirect * Signature: (JJLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_RocksDB_deleteDirect(JNIEnv* env, jobject /*jdb*/, +void Java_org_rocksdb_RocksDB_deleteDirect(JNIEnv* env, jclass /*jdb*/, jlong jdb_handle, jlong jwrite_options, jobject jkey, jint jkey_offset, jint jkey_len, @@ -1488,7 +1399,7 @@ void Java_org_rocksdb_RocksDB_deleteDirect(JNIEnv* env, jobject /*jdb*/, * Method: write0 * Signature: (JJJ)V */ -void Java_org_rocksdb_RocksDB_write0(JNIEnv* env, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_write0(JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options_handle, jlong jwb_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -1508,7 +1419,7 @@ void Java_org_rocksdb_RocksDB_write0(JNIEnv* env, jobject, jlong jdb_handle, * Method: write1 * Signature: (JJJ)V */ -void Java_org_rocksdb_RocksDB_write1(JNIEnv* env, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_write1(JNIEnv* env, jclass, jlong jdb_handle, jlong jwrite_options_handle, jlong jwbwi_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -1528,64 +1439,29 @@ void Java_org_rocksdb_RocksDB_write1(JNIEnv* env, jobject, jlong jdb_handle, ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::DB::Get -jbyteArray rocksdb_get_helper( - JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& read_opt, - ROCKSDB_NAMESPACE::ColumnFamilyHandle* column_family_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] key; - return nullptr; - } - - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - - ROCKSDB_NAMESPACE::PinnableSlice pinnable_value; - ROCKSDB_NAMESPACE::Status s; - if (column_family_handle != nullptr) { - s = db->Get(read_opt, column_family_handle, key_slice, &pinnable_value); - } else { - s = db->Get(read_opt, db->DefaultColumnFamily(), key_slice, - &pinnable_value); - } - - // cleanup - delete[] key; - - if (s.IsNotFound()) { - return nullptr; - } - - if (s.ok()) { - jbyteArray jret_value = - ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, pinnable_value); - pinnable_value.Reset(); - if (jret_value == nullptr) { - // exception occurred - return nullptr; - } - return jret_value; - } - - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; -} - /* * Class: org_rocksdb_RocksDB * Method: get * Signature: (J[BII)[B */ -jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jobject, +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len) { - return rocksdb_get_helper( - env, reinterpret_cast(jdb_handle), - ROCKSDB_NAMESPACE::ReadOptions(), nullptr, jkey, jkey_off, jkey_len); + auto* db = reinterpret_cast(jdb_handle); + + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(), + key.slice(), &value.pinnable_slice())); + return value.NewByteArray(); + + } catch (ROCKSDB_NAMESPACE::KVException&) { + return nullptr; + } } /* @@ -1593,21 +1469,27 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jobject, * Method: get * Signature: (J[BIIJ)[B */ -jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jobject, +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { - auto db_handle = reinterpret_cast(jdb_handle); - auto cf_handle = - reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - return rocksdb_get_helper(env, db_handle, ROCKSDB_NAMESPACE::ReadOptions(), - cf_handle, jkey, jkey_off, jkey_len); - } else { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, ROCKSDB_NAMESPACE::Status::InvalidArgument( - "Invalid ColumnFamilyHandle.")); + auto* db = reinterpret_cast(jdb_handle); + auto cf_handle = ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handleFromJLong( + env, jcf_handle); + if (cf_handle == nullptr) { + return nullptr; + } + + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(), + &value.pinnable_slice())); + return value.NewByteArray(); + + } catch (ROCKSDB_NAMESPACE::KVException&) { return nullptr; } } @@ -1617,15 +1499,25 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jobject, * Method: get * Signature: (JJ[BII)[B */ -jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jobject, +jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_off, jint jkey_len) { - return rocksdb_get_helper( - env, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), nullptr, - jkey, jkey_off, jkey_len); + auto* db = reinterpret_cast(jdb_handle); + + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + db->Get( + *reinterpret_cast(jropt_handle), + db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice())); + return value.NewByteArray(); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return nullptr; + } } /* @@ -1634,87 +1526,26 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jobject, * Signature: (JJ[BIIJ)[B */ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, + JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { - auto* db_handle = reinterpret_cast(jdb_handle); - auto& ro_opt = - *reinterpret_cast(jropt_handle); - auto* cf_handle = - reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off, - jkey_len); - } else { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, ROCKSDB_NAMESPACE::Status::InvalidArgument( - "Invalid ColumnFamilyHandle.")); + auto* db = reinterpret_cast(jdb_handle); + auto cf_handle = ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handleFromJLong( + env, jcf_handle); + if (cf_handle == nullptr) { return nullptr; } -} - -jint rocksdb_get_helper( - JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& read_options, - ROCKSDB_NAMESPACE::ColumnFamilyHandle* column_family_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len, bool* has_exception) { - static const int kNotFound = -1; - static const int kStatusError = -2; - - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if (env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - delete[] key; - *has_exception = true; - return kStatusError; - } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - - ROCKSDB_NAMESPACE::PinnableSlice pinnable_value; - ROCKSDB_NAMESPACE::Status s; - if (column_family_handle != nullptr) { - s = db->Get(read_options, column_family_handle, key_slice, &pinnable_value); - } else { - s = db->Get(read_options, db->DefaultColumnFamily(), key_slice, - &pinnable_value); - } - - // cleanup - delete[] key; - - if (s.IsNotFound()) { - *has_exception = false; - return kNotFound; - } else if (!s.ok()) { - *has_exception = true; - // Here since we are throwing a Java exception from c++ side. - // As a result, c++ does not know calling this function will in fact - // throwing an exception. As a result, the execution flow will - // not stop here, and codes after this throw will still be - // executed. - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - - // Return a dummy const value to avoid compilation error, although - // java side might not have a chance to get the return value :) - return kStatusError; - } - const jint pinnable_value_len = static_cast(pinnable_value.size()); - const jint length = std::min(jval_len, pinnable_value_len); - - env->SetByteArrayRegion(jval, jval_off, length, - const_cast(reinterpret_cast( - pinnable_value.data()))); - pinnable_value.Reset(); - if (env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - *has_exception = true; - return kStatusError; + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Get(*reinterpret_cast( + jropt_handle), + cf_handle, key.slice(), &value.pinnable_slice())); + return value.NewByteArray(); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return nullptr; } - - *has_exception = false; - return pinnable_value_len; } /* @@ -1722,16 +1553,25 @@ jint rocksdb_get_helper( * Method: get * Signature: (J[BII[BII)I */ -jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jobject, +jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len) { - bool has_exception = false; - return rocksdb_get_helper( - env, reinterpret_cast(jdb_handle), - ROCKSDB_NAMESPACE::ReadOptions(), nullptr, jkey, jkey_off, jkey_len, jval, - jval_off, jval_len, &has_exception); + auto* db = reinterpret_cast(jdb_handle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(), + key.slice(), &value.pinnable_slice())); + return value.Fetch(); + + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); + } } /* @@ -1739,26 +1579,29 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jobject, * Method: get * Signature: (J[BII[BIIJ)I */ -jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jobject, +jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { - auto* db_handle = reinterpret_cast(jdb_handle); - auto* cf_handle = - reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - bool has_exception = false; - return rocksdb_get_helper(env, db_handle, ROCKSDB_NAMESPACE::ReadOptions(), - cf_handle, jkey, jkey_off, jkey_len, jval, - jval_off, jval_len, &has_exception); - } else { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, ROCKSDB_NAMESPACE::Status::InvalidArgument( - "Invalid ColumnFamilyHandle.")); - // will never be evaluated - return 0; + auto* db = reinterpret_cast(jdb_handle); + auto cf_handle = ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handleFromJLong( + env, jcf_handle); + if (cf_handle == nullptr) { + return ROCKSDB_NAMESPACE::KVException::kStatusError; + } + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(), + &value.pinnable_slice())); + return value.Fetch(); + + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); } } @@ -1767,17 +1610,27 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jobject, * Method: get * Signature: (JJ[BII[BII)I */ -jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jobject, +jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len) { - bool has_exception = false; - return rocksdb_get_helper( - env, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), nullptr, - jkey, jkey_off, jkey_len, jval, jval_off, jval_len, &has_exception); + auto* db = reinterpret_cast(jdb_handle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + db->Get( + *reinterpret_cast(jropt_handle), + db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice())); + return value.Fetch(); + + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); + } } /* @@ -1786,273 +1639,28 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jobject, * Signature: (JJ[BII[BIIJ)I */ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, + JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { - auto* db_handle = reinterpret_cast(jdb_handle); - auto& ro_opt = - *reinterpret_cast(jropt_handle); - auto* cf_handle = - reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - bool has_exception = false; - return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len, - &has_exception); - } else { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, ROCKSDB_NAMESPACE::Status::InvalidArgument( - "Invalid ColumnFamilyHandle.")); - // will never be evaluated - return 0; - } -} - -inline void multi_get_helper_release_keys(std::vector& keys_to_free) { - auto end = keys_to_free.end(); - for (auto it = keys_to_free.begin(); it != end; ++it) { - delete[] * it; - } - keys_to_free.clear(); -} - -/** - * @brief fill a native array of cf handles from java handles - * - * @param env - * @param cf_handles to fill from the java variants - * @param jcolumn_family_handles - * @return true if the copy succeeds - * @return false if a JNI exception is generated - */ -inline bool cf_handles_from_jcf_handles( - JNIEnv* env, - std::vector& cf_handles, - jlongArray jcolumn_family_handles) { - if (jcolumn_family_handles != nullptr) { - const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); - - jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); - if (jcfh == nullptr) { - // exception thrown: OutOfMemoryError - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, - "Insufficient Memory for CF handle array."); - return false; - } - - for (jsize i = 0; i < len_cols; i++) { - auto* cf_handle = - reinterpret_cast(jcfh[i]); - cf_handles.push_back(cf_handle); - } - env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); - } - return true; -} - -/** - * @brief copy keys from JNI into vector of slices for Rocks API - * - * @param keys to instantiate - * @param jkeys - * @param jkey_offs - * @param jkey_lens - * @return true if the copy succeeds - * @return false if a JNI exception is raised - */ -inline bool keys_from_jkeys(JNIEnv* env, - std::vector& keys, - std::vector& keys_to_free, - jobjectArray jkeys, jintArray jkey_offs, - jintArray jkey_lens) { - jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); - if (jkey_off == nullptr) { - // exception thrown: OutOfMemoryError - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array."); - return false; - } - - jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); - if (jkey_len == nullptr) { - // exception thrown: OutOfMemoryError - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array."); - return false; - } - - const jsize len_keys = env->GetArrayLength(jkeys); - for (jsize i = 0; i < len_keys; i++) { - jobject jkey = env->GetObjectArrayElement(jkeys, i); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - multi_get_helper_release_keys(keys_to_free); - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, - "Insufficient Memory for key object array."); - return false; - } - - jbyteArray jkey_ba = reinterpret_cast(jkey); - - const jint len_key = jkey_len[i]; - jbyte* key = new jbyte[len_key]; - env->GetByteArrayRegion(jkey_ba, jkey_off[i], len_key, key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] key; - env->DeleteLocalRef(jkey); - env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - multi_get_helper_release_keys(keys_to_free); - jclass exception_cls = - (env)->FindClass("java/lang/ArrayIndexOutOfBoundsException"); - (env)->ThrowNew(exception_cls, "Invalid byte array region index."); - return false; - } - - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), len_key); - keys.push_back(key_slice); - - env->DeleteLocalRef(jkey); - keys_to_free.push_back(key); - } - - // cleanup jkey_off and jken_len - env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - - return true; -} - -inline bool keys_from_bytebuffers(JNIEnv* env, - std::vector& keys, - jobjectArray jkeys, jintArray jkey_offs, - jintArray jkey_lens) { - jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); - if (jkey_off == nullptr) { - // exception thrown: OutOfMemoryError - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array."); - return false; - } - - jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); - if (jkey_len == nullptr) { - // exception thrown: OutOfMemoryError - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array."); - return false; - } - - const jsize len_keys = env->GetArrayLength(jkeys); - for (jsize i = 0; i < len_keys; i++) { - jobject jkey = env->GetObjectArrayElement(jkeys, i); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - // cleanup jkey_off and jkey_len - env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - - return false; - } - char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); - ROCKSDB_NAMESPACE::Slice key_slice(key + jkey_off[i], jkey_len[i]); - keys.push_back(key_slice); - - env->DeleteLocalRef(jkey); - } - - // cleanup jkey_off and jkey_len - env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); - env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - - return true; -} - -/** - * cf multi get - * - * @return byte[][] of values or nullptr if an - * exception occurs - */ -jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& rOpt, - jobjectArray jkeys, jintArray jkey_offs, - jintArray jkey_lens, - jlongArray jcolumn_family_handles) { - std::vector cf_handles; - if (!cf_handles_from_jcf_handles(env, cf_handles, jcolumn_family_handles)) { - return nullptr; - } - - std::vector keys; - std::vector keys_to_free; - if (!keys_from_jkeys(env, keys, keys_to_free, jkeys, jkey_offs, jkey_lens)) { - return nullptr; - } - - std::vector values; - std::vector s; - if (cf_handles.size() == 0) { - s = db->MultiGet(rOpt, keys, &values); - } else { - s = db->MultiGet(rOpt, cf_handles, keys, &values); + auto* db = reinterpret_cast(jdb_handle); + auto cf_handle = ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handleFromJLong( + env, jcf_handle); + if (cf_handle == nullptr) { + return ROCKSDB_NAMESPACE::KVException::kStatusError; } + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, db->Get(*reinterpret_cast( + jropt_handle), + cf_handle, key.slice(), &value.pinnable_slice())); + return value.Fetch(); - // free up allocated byte arrays - multi_get_helper_release_keys(keys_to_free); - - // prepare the results - jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( - env, static_cast(s.size())); - if (jresults == nullptr) { - // exception occurred - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, "Insufficient Memory for results."); - return nullptr; + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); } - - // add to the jresults - for (std::vector::size_type i = 0; i != s.size(); - i++) { - if (s[i].ok()) { - std::string* value = &values[i]; - const jsize jvalue_len = static_cast(value->size()); - jbyteArray jentry_value = env->NewByteArray(jvalue_len); - if (jentry_value == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - - env->SetByteArrayRegion( - jentry_value, 0, static_cast(jvalue_len), - const_cast(reinterpret_cast(value->c_str()))); - if (env->ExceptionCheck()) { - // exception thrown: - // ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jentry_value); - return nullptr; - } - - env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); - if (env->ExceptionCheck()) { - // exception thrown: - // ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jentry_value); - return nullptr; - } - - env->DeleteLocalRef(jentry_value); - } - } - - return jresults; } /** @@ -2062,192 +1670,152 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, * exception on a problem */ -/** - * @brief multi_get_helper_direct for fast-path multiget (io_uring) on Linux - * - * @param env - * @param db - * @param rOpt read options - * @param jcolumn_family_handles 0, 1, or n column family handles - * @param jkeys - * @param jkey_offsets - * @param jkey_lengths - * @param jvalues byte buffers to receive values - * @param jvalue_sizes returned actual sizes of data values for keys - * @param jstatuses returned java RocksDB status values for per key - */ -void multi_get_helper_direct(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& rOpt, - jlongArray jcolumn_family_handles, - jobjectArray jkeys, jintArray jkey_offsets, - jintArray jkey_lengths, jobjectArray jvalues, - jintArray jvalue_sizes, jobjectArray jstatuses) { - const jsize num_keys = env->GetArrayLength(jkeys); - - std::vector keys; - if (!keys_from_bytebuffers(env, keys, jkeys, jkey_offsets, jkey_lengths)) { - return; - } - - std::vector values(num_keys); - - std::vector cf_handles; - if (!cf_handles_from_jcf_handles(env, cf_handles, jcolumn_family_handles)) { - return; - } - - std::vector s(num_keys); - if (cf_handles.size() == 0) { - // we can use the more efficient call here - auto cf_handle = db->DefaultColumnFamily(); - db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(), - s.data()); - } else if (cf_handles.size() == 1) { - // we can use the more efficient call here - auto cf_handle = cf_handles[0]; - db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(), - s.data()); - } else { - // multiple CFs version - db->MultiGet(rOpt, num_keys, cf_handles.data(), keys.data(), values.data(), - s.data()); - } - - // prepare the results - jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( - env, static_cast(s.size())); - if (jresults == nullptr) { - // exception occurred - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, "Insufficient Memory for results."); - return; - } - - std::vector value_size; - for (int i = 0; i < num_keys; i++) { - auto jstatus = ROCKSDB_NAMESPACE::StatusJni::construct(env, s[i]); - if (jstatus == nullptr) { - // exception in context - return; - } - env->SetObjectArrayElement(jstatuses, i, jstatus); - - if (s[i].ok()) { - jobject jvalue_bytebuf = env->GetObjectArrayElement(jvalues, i); - if (env->ExceptionCheck()) { - // ArrayIndexOutOfBoundsException is thrown - return; - } - jlong jvalue_capacity = env->GetDirectBufferCapacity(jvalue_bytebuf); - if (jvalue_capacity == -1) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, - "Invalid value(s) argument (argument is not a valid direct " - "ByteBuffer)"); - return; - } - void* jvalue_address = env->GetDirectBufferAddress(jvalue_bytebuf); - if (jvalue_address == nullptr) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, - "Invalid value(s) argument (argument is not a valid direct " - "ByteBuffer)"); - return; - } - - // record num returned, push back that number, which may be bigger then - // the ByteBuffer supplied. then copy as much as fits in the ByteBuffer. - value_size.push_back(static_cast(values[i].size())); - auto copy_bytes = - std::min(static_cast(values[i].size()), jvalue_capacity); - memcpy(jvalue_address, values[i].data(), copy_bytes); - } else { - // bad status for this - value_size.push_back(0); - } - } - - env->SetIntArrayRegion(jvalue_sizes, 0, num_keys, value_size.data()); -} - /* + * @brief Use the efficient/optimized variant of MultiGet() + * * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: (J[[B[I[I)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I( - JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + JNIEnv* env, jclass, jlong jdb_handle, jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) { - return multi_get_helper( - env, jdb, reinterpret_cast(jdb_handle), - ROCKSDB_NAMESPACE::ReadOptions(), jkeys, jkey_offs, jkey_lens, nullptr); + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkeys, jkey_offs, jkey_lens)) { + return nullptr; + } + std::vector values(keys.size()); + std::vector statuses(keys.size()); + auto* db = reinterpret_cast(jdb_handle); + db->MultiGet(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(), + keys.size(), keys.data(), values.data(), statuses.data(), + false /* sorted_input*/); + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* + * @brief Use the efficient/optimized variant of MultiGet() + * * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: (J[[B[I[I[J)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I_3J( - JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + JNIEnv* env, jclass, jlong jdb_handle, jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens, jlongArray jcolumn_family_handles) { - return multi_get_helper(env, jdb, - reinterpret_cast(jdb_handle), - ROCKSDB_NAMESPACE::ReadOptions(), jkeys, jkey_offs, - jkey_lens, jcolumn_family_handles); + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkeys, jkey_offs, jkey_lens)) return nullptr; + auto cf_handles = + ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handlesFromJLongArray( + env, jcolumn_family_handles); + if (!cf_handles) return nullptr; + std::vector values(keys.size()); + std::vector statuses(keys.size()); + auto* db = reinterpret_cast(jdb_handle); + db->MultiGet(ROCKSDB_NAMESPACE::ReadOptions(), keys.size(), + cf_handles->data(), keys.data(), values.data(), statuses.data(), + /* sorted_input */ false); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* + * @brief Use the efficient/optimized variant of MultiGet() + * * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: (JJ[[B[I[I)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3I_3I( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) { - return multi_get_helper( - env, jdb, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), jkeys, - jkey_offs, jkey_lens, nullptr); + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkeys, jkey_offs, jkey_lens)) { + return nullptr; + } + std::vector values(keys.size()); + std::vector statuses(keys.size()); + auto* db = reinterpret_cast(jdb_handle); + db->MultiGet(*reinterpret_cast(jropt_handle), + db->DefaultColumnFamily(), keys.size(), keys.data(), + values.data(), statuses.data(), false /* sorted_input*/); + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* + * @brief Use the efficient/optimized variant of MultiGet() + * * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: (JJ[[B[I[I[J)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3I_3I_3J( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens, jlongArray jcolumn_family_handles) { - return multi_get_helper( - env, jdb, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), jkeys, - jkey_offs, jkey_lens, jcolumn_family_handles); + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkeys, jkey_offs, jkey_lens)) return nullptr; + auto cf_handles = + ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handlesFromJLongArray( + env, jcolumn_family_handles); + if (!cf_handles) return nullptr; + std::vector values(keys.size()); + std::vector statuses(keys.size()); + auto* db = reinterpret_cast(jdb_handle); + db->MultiGet(*reinterpret_cast(jropt_handle), + keys.size(), cf_handles->data(), keys.data(), values.data(), + statuses.data(), + /* sorted_input */ false); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* + * @brief Use the efficient/optimized variant of MultiGet() + * + * Should make use of fast-path multiget (io_uring) on Linux + * * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: * (JJ[J[Ljava/nio/ByteBuffer;[I[I[Ljava/nio/ByteBuffer;[I[Lorg/rocksdb/Status;)V */ void Java_org_rocksdb_RocksDB_multiGet__JJ_3J_3Ljava_nio_ByteBuffer_2_3I_3I_3Ljava_nio_ByteBuffer_2_3I_3Lorg_rocksdb_Status_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, - jlongArray jcolumn_family_handles, jobjectArray jkeys, - jintArray jkey_offsets, jintArray jkey_lengths, jobjectArray jvalues, - jintArray jvalues_sizes, jobjectArray jstatus_objects) { - return multi_get_helper_direct( - env, jdb, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), - jcolumn_family_handles, jkeys, jkey_offsets, jkey_lengths, jvalues, - jvalues_sizes, jstatus_objects); + JNIEnv* env, jclass, jlong jdb_handle, jlong jropt_handle, + jlongArray jcolumn_family_handles, jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens, jobjectArray jvalues, jintArray jvalues_sizes, + jobjectArray jstatus_objects) { + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteBuffers(env, jkeys, jkey_offs, jkey_lens)) { + // exception thrown + return; + } + auto cf_handles = + ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handlesFromJLongArray( + env, jcolumn_family_handles); + std::vector values(keys.size()); + std::vector statuses(keys.size()); + auto* db = reinterpret_cast(jdb_handle); + auto ro = *reinterpret_cast(jropt_handle); + if (cf_handles->size() == 0) { + db->MultiGet(ro, db->DefaultColumnFamily(), keys.size(), keys.data(), + values.data(), statuses.data(), false /* sorted_input*/); + } else if (cf_handles->size() == 1) { + db->MultiGet(ro, cf_handles->data()[0], keys.size(), keys.data(), + values.data(), statuses.data(), false /* sorted_input*/); + } else { + db->MultiGet(ro, keys.size(), cf_handles->data(), keys.data(), + values.data(), statuses.data(), + /* sorted_input */ false); + } + ROCKSDB_NAMESPACE::MultiGetJNIValues::fillByteBuffersAndStatusObjects( + env, values, statuses, jvalues, jvalues_sizes, jstatus_objects); } -// private native void -// multiGet(final long dbHandle, final long rOptHandle, -// final long[] columnFamilyHandles, final ByteBuffer[] keysArray, -// final ByteBuffer[] valuesArray); ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::DB::KeyMayExist @@ -2382,7 +1950,7 @@ jboolean key_exists_helper(JNIEnv* env, jlong jdb_handle, jlong jcf_handle, * Method: keyExist * Signature: (JJJ[BII)Z */ -jboolean Java_org_rocksdb_RocksDB_keyExists(JNIEnv* env, jobject, +jboolean Java_org_rocksdb_RocksDB_keyExists(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, @@ -2413,7 +1981,7 @@ jboolean Java_org_rocksdb_RocksDB_keyExists(JNIEnv* env, jobject, * Signature: (JJJLjava/nio/ByteBuffer;II)Z */ jboolean Java_org_rocksdb_RocksDB_keyExistsDirect( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len) { char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); if (key == nullptr) { @@ -2440,7 +2008,7 @@ jboolean Java_org_rocksdb_RocksDB_keyExistsDirect( * Signature: (JJJ[BII)Z */ jboolean Java_org_rocksdb_RocksDB_keyMayExist( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) { bool has_exception = false; std::string value; @@ -2464,7 +2032,7 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist( * Signature: (JJJLjava/nio/ByteBuffer;II)Z */ jboolean Java_org_rocksdb_RocksDB_keyMayExistDirect( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len) { bool has_exception = false; std::string value; @@ -2488,7 +2056,7 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExistDirect( * (JJJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)[J */ jintArray Java_org_rocksdb_RocksDB_keyMayExistDirectFoundValue( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len, jobject jval, jint jval_offset, jint jval_len) { char* val_buffer = reinterpret_cast(env->GetDirectBufferAddress(jval)); @@ -2559,7 +2127,7 @@ jintArray Java_org_rocksdb_RocksDB_keyMayExistDirectFoundValue( * Signature: (JJJ[BII)[[B */ jobjectArray Java_org_rocksdb_RocksDB_keyMayExistFoundValue( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) { bool has_exception = false; std::string value; @@ -2644,55 +2212,17 @@ jobjectArray Java_org_rocksdb_RocksDB_keyMayExistFoundValue( /* * Class: org_rocksdb_RocksDB * Method: iterator - * Signature: (J)J - */ -jlong Java_org_rocksdb_RocksDB_iterator__J(JNIEnv*, jobject, jlong db_handle) { - auto* db = reinterpret_cast(db_handle); - return rocksdb_iterator_helper(db, ROCKSDB_NAMESPACE::ReadOptions(), nullptr); -} - -/* - * Class: org_rocksdb_RocksDB - * Method: iterator - * Signature: (JJ)J - */ -jlong Java_org_rocksdb_RocksDB_iterator__JJ(JNIEnv*, jobject, jlong db_handle, - jlong jread_options_handle) { - auto* db = reinterpret_cast(db_handle); - auto& read_options = - *reinterpret_cast(jread_options_handle); - return rocksdb_iterator_helper(db, read_options, nullptr); -} - -/* - * Class: org_rocksdb_RocksDB - * Method: iteratorCF - * Signature: (JJ)J - */ -jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ(JNIEnv*, jobject, jlong db_handle, - jlong jcf_handle) { - auto* db = reinterpret_cast(db_handle); - auto* cf_handle = - reinterpret_cast(jcf_handle); - return rocksdb_iterator_helper(db, ROCKSDB_NAMESPACE::ReadOptions(), - cf_handle); -} - -/* - * Class: org_rocksdb_RocksDB - * Method: iteratorCF * Signature: (JJJ)J */ -jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ(JNIEnv*, jobject, - jlong db_handle, - jlong jcf_handle, - jlong jread_options_handle) { +jlong Java_org_rocksdb_RocksDB_iterator(JNIEnv*, jclass, jlong db_handle, + jlong jcf_handle, + jlong jread_options_handle) { auto* db = reinterpret_cast(db_handle); auto* cf_handle = reinterpret_cast(jcf_handle); auto& read_options = *reinterpret_cast(jread_options_handle); - return rocksdb_iterator_helper(db, read_options, cf_handle); + return GET_CPLUSPLUS_POINTER(db->NewIterator(read_options, cf_handle)); } /* @@ -2700,7 +2230,7 @@ jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ(JNIEnv*, jobject, * Method: iterators * Signature: (J[JJ)[J */ -jlongArray Java_org_rocksdb_RocksDB_iterators(JNIEnv* env, jobject, +jlongArray Java_org_rocksdb_RocksDB_iterators(JNIEnv* env, jclass, jlong db_handle, jlongArray jcolumn_family_handles, jlong jread_options_handle) { @@ -2760,7 +2290,7 @@ jlongArray Java_org_rocksdb_RocksDB_iterators(JNIEnv* env, jobject, * Method: getSnapshot * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_getSnapshot(JNIEnv*, jobject, jlong db_handle) { +jlong Java_org_rocksdb_RocksDB_getSnapshot(JNIEnv*, jclass, jlong db_handle) { auto* db = reinterpret_cast(db_handle); const ROCKSDB_NAMESPACE::Snapshot* snapshot = db->GetSnapshot(); return GET_CPLUSPLUS_POINTER(snapshot); @@ -2770,7 +2300,7 @@ jlong Java_org_rocksdb_RocksDB_getSnapshot(JNIEnv*, jobject, jlong db_handle) { * Method: releaseSnapshot * Signature: (JJ)V */ -void Java_org_rocksdb_RocksDB_releaseSnapshot(JNIEnv*, jobject, jlong db_handle, +void Java_org_rocksdb_RocksDB_releaseSnapshot(JNIEnv*, jclass, jlong db_handle, jlong snapshot_handle) { auto* db = reinterpret_cast(db_handle); auto* snapshot = @@ -2783,7 +2313,7 @@ void Java_org_rocksdb_RocksDB_releaseSnapshot(JNIEnv*, jobject, jlong db_handle, * Method: getProperty * Signature: (JJLjava/lang/String;I)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getProperty(JNIEnv* env, jobject, +jstring Java_org_rocksdb_RocksDB_getProperty(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jstring jproperty, jint jproperty_len) { @@ -2821,7 +2351,7 @@ jstring Java_org_rocksdb_RocksDB_getProperty(JNIEnv* env, jobject, * Method: getMapProperty * Signature: (JJLjava/lang/String;I)Ljava/util/Map; */ -jobject Java_org_rocksdb_RocksDB_getMapProperty(JNIEnv* env, jobject, +jobject Java_org_rocksdb_RocksDB_getMapProperty(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jstring jproperty, @@ -2860,7 +2390,7 @@ jobject Java_org_rocksdb_RocksDB_getMapProperty(JNIEnv* env, jobject, * Method: getLongProperty * Signature: (JJLjava/lang/String;I)J */ -jlong Java_org_rocksdb_RocksDB_getLongProperty(JNIEnv* env, jobject, +jlong Java_org_rocksdb_RocksDB_getLongProperty(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jstring jproperty, @@ -2899,7 +2429,7 @@ jlong Java_org_rocksdb_RocksDB_getLongProperty(JNIEnv* env, jobject, * Method: resetStats * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_resetStats(JNIEnv*, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_resetStats(JNIEnv*, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); db->ResetStats(); } @@ -2909,7 +2439,7 @@ void Java_org_rocksdb_RocksDB_resetStats(JNIEnv*, jobject, jlong jdb_handle) { * Method: getAggregatedLongProperty * Signature: (JLjava/lang/String;I)J */ -jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty(JNIEnv* env, jobject, +jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty(JNIEnv* env, jclass, jlong db_handle, jstring jproperty, jint jproperty_len) { @@ -2938,7 +2468,7 @@ jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty(JNIEnv* env, jobject, * Signature: (JJ[JB)[J */ jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlongArray jrange_slice_handles, jbyte jinclude_flags) { const jsize jlen = env->GetArrayLength(jrange_slice_handles); const size_t range_count = jlen / 2; @@ -3017,8 +2547,8 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes( * Signature: (JJJJ)[J */ jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jlong jstartHandle, jlong jlimitHandle) { + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlong jstartHandle, + jlong jlimitHandle) { auto* start = reinterpret_cast(jstartHandle); auto* limit = reinterpret_cast(jlimitHandle); const ROCKSDB_NAMESPACE::Range range(*start, *limit); @@ -3060,7 +2590,7 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats( * Method: compactRange * Signature: (J[BI[BIJJ)V */ -void Java_org_rocksdb_RocksDB_compactRange(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_compactRange(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jbegin, jint jbegin_len, jbyteArray jend, jint jend_len, @@ -3114,17 +2644,20 @@ void Java_org_rocksdb_RocksDB_compactRange(JNIEnv* env, jobject, } ROCKSDB_NAMESPACE::Status s; - if (jbegin_len > 0 || jend_len > 0) { - const ROCKSDB_NAMESPACE::Slice begin(str_begin); - const ROCKSDB_NAMESPACE::Slice end(str_end); - s = db->CompactRange(*compact_range_opts, cf_handle, &begin, &end); - } else { - s = db->CompactRange(*compact_range_opts, cf_handle, nullptr, nullptr); - } + std::unique_ptr begin; + std::unique_ptr end; + if (jbegin_len > 0) { + begin.reset(new ROCKSDB_NAMESPACE::Slice(str_begin)); + } + if (jend_len > 0) { + end.reset(new ROCKSDB_NAMESPACE::Slice(str_end)); + } + s = db->CompactRange(*compact_range_opts, cf_handle, begin.get(), end.get()); if (jcompact_range_opts_handle == 0) { delete compact_range_opts; } + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } @@ -3133,7 +2666,7 @@ void Java_org_rocksdb_RocksDB_compactRange(JNIEnv* env, jobject, * Method: setOptions * Signature: (JJ[Ljava/lang/String;[Ljava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_setOptions(JNIEnv* env, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_setOptions(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jobjectArray jkeys, jobjectArray jvalues) { const jsize len = env->GetArrayLength(jkeys); @@ -3196,7 +2729,7 @@ void Java_org_rocksdb_RocksDB_setOptions(JNIEnv* env, jobject, jlong jdb_handle, * Method: setDBOptions * Signature: (J[Ljava/lang/String;[Ljava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_setDBOptions(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_setDBOptions(JNIEnv* env, jclass, jlong jdb_handle, jobjectArray jkeys, jobjectArray jvalues) { const jsize len = env->GetArrayLength(jkeys); @@ -3254,7 +2787,7 @@ void Java_org_rocksdb_RocksDB_setDBOptions(JNIEnv* env, jobject, * Method: getOptions * Signature: (JJ)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getOptions(JNIEnv* env, jobject, +jstring Java_org_rocksdb_RocksDB_getOptions(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -3283,7 +2816,7 @@ jstring Java_org_rocksdb_RocksDB_getOptions(JNIEnv* env, jobject, * Method: getDBOptions * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jobject, +jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -3303,8 +2836,7 @@ jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jobject, * Method: setPerfLevel * Signature: (JB)V */ -void Java_org_rocksdb_RocksDB_setPerfLevel(JNIEnv*, jobject, - jbyte jperf_level) { +void Java_org_rocksdb_RocksDB_setPerfLevel(JNIEnv*, jclass, jbyte jperf_level) { rocksdb::SetPerfLevel( ROCKSDB_NAMESPACE::PerfLevelTypeJni::toCppPerfLevelType(jperf_level)); } @@ -3314,7 +2846,7 @@ void Java_org_rocksdb_RocksDB_setPerfLevel(JNIEnv*, jobject, * Method: getPerfLevel * Signature: (J)B */ -jbyte Java_org_rocksdb_RocksDB_getPerfLevelNative(JNIEnv*, jobject) { +jbyte Java_org_rocksdb_RocksDB_getPerfLevelNative(JNIEnv*, jclass) { return ROCKSDB_NAMESPACE::PerfLevelTypeJni::toJavaPerfLevelType( rocksdb::GetPerfLevel()); } @@ -3324,7 +2856,7 @@ jbyte Java_org_rocksdb_RocksDB_getPerfLevelNative(JNIEnv*, jobject) { * Method: getPerfContextNative * Signature: ()J */ -jlong Java_org_rocksdb_RocksDB_getPerfContextNative(JNIEnv*, jobject) { +jlong Java_org_rocksdb_RocksDB_getPerfContextNative(JNIEnv*, jclass) { ROCKSDB_NAMESPACE::PerfContext* perf_context = rocksdb::get_perf_context(); return reinterpret_cast(perf_context); } @@ -3335,7 +2867,7 @@ jlong Java_org_rocksdb_RocksDB_getPerfContextNative(JNIEnv*, jobject) { * Signature: (JJJ[Ljava/lang/String;IIJ)[Ljava/lang/String; */ jobjectArray Java_org_rocksdb_RocksDB_compactFiles( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcompaction_opts_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcompaction_opts_handle, jlong jcf_handle, jobjectArray jinput_file_names, jint joutput_level, jint joutput_path_id, jlong jcompaction_job_info_handle) { jboolean has_exception = JNI_FALSE; @@ -3384,7 +2916,7 @@ jobjectArray Java_org_rocksdb_RocksDB_compactFiles( * Method: cancelAllBackgroundWork * Signature: (JZ)V */ -void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork(JNIEnv*, jobject, +void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork(JNIEnv*, jclass, jlong jdb_handle, jboolean jwait) { auto* db = reinterpret_cast(jdb_handle); @@ -3396,7 +2928,7 @@ void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork(JNIEnv*, jobject, * Method: pauseBackgroundWork * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_pauseBackgroundWork(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_pauseBackgroundWork(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->PauseBackgroundWork(); @@ -3410,7 +2942,7 @@ void Java_org_rocksdb_RocksDB_pauseBackgroundWork(JNIEnv* env, jobject, * Method: continueBackgroundWork * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->ContinueBackgroundWork(); @@ -3424,7 +2956,7 @@ void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jobject, * Method: enableAutoCompaction * Signature: (J[J)V */ -void Java_org_rocksdb_RocksDB_enableAutoCompaction(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_enableAutoCompaction(JNIEnv* env, jclass, jlong jdb_handle, jlongArray jcf_handles) { auto* db = reinterpret_cast(jdb_handle); @@ -3445,7 +2977,7 @@ void Java_org_rocksdb_RocksDB_enableAutoCompaction(JNIEnv* env, jobject, * Method: numberLevels * Signature: (JJ)I */ -jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jobject, jlong jdb_handle, +jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; @@ -3463,7 +2995,7 @@ jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jobject, jlong jdb_handle, * Method: maxMemCompactionLevel * Signature: (JJ)I */ -jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jobject, +jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -3482,7 +3014,7 @@ jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jobject, * Method: level0StopWriteTrigger * Signature: (JJ)I */ -jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger(JNIEnv*, jobject, +jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger(JNIEnv*, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -3501,7 +3033,7 @@ jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger(JNIEnv*, jobject, * Method: getName * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getName(JNIEnv* env, jobject, +jstring Java_org_rocksdb_RocksDB_getName(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); std::string name = db->GetName(); @@ -3513,7 +3045,7 @@ jstring Java_org_rocksdb_RocksDB_getName(JNIEnv* env, jobject, * Method: getEnv * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_getEnv(JNIEnv*, jobject, jlong jdb_handle) { +jlong Java_org_rocksdb_RocksDB_getEnv(JNIEnv*, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); return GET_CPLUSPLUS_POINTER(db->GetEnv()); } @@ -3523,7 +3055,7 @@ jlong Java_org_rocksdb_RocksDB_getEnv(JNIEnv*, jobject, jlong jdb_handle) { * Method: flush * Signature: (JJ[J)V */ -void Java_org_rocksdb_RocksDB_flush(JNIEnv* env, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_flush(JNIEnv* env, jclass, jlong jdb_handle, jlong jflush_opts_handle, jlongArray jcf_handles) { auto* db = reinterpret_cast(jdb_handle); @@ -3553,7 +3085,7 @@ void Java_org_rocksdb_RocksDB_flush(JNIEnv* env, jobject, jlong jdb_handle, * Method: flushWal * Signature: (JZ)V */ -void Java_org_rocksdb_RocksDB_flushWal(JNIEnv* env, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_flushWal(JNIEnv* env, jclass, jlong jdb_handle, jboolean jsync) { auto* db = reinterpret_cast(jdb_handle); auto s = db->FlushWAL(jsync == JNI_TRUE); @@ -3567,7 +3099,7 @@ void Java_org_rocksdb_RocksDB_flushWal(JNIEnv* env, jobject, jlong jdb_handle, * Method: syncWal * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_syncWal(JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_syncWal(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->SyncWAL(); if (!s.ok()) { @@ -3580,7 +3112,7 @@ void Java_org_rocksdb_RocksDB_syncWal(JNIEnv* env, jobject, jlong jdb_handle) { * Method: getLatestSequenceNumber * Signature: (J)V */ -jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv*, jobject, +jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv*, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); return db->GetLatestSequenceNumber(); @@ -3591,7 +3123,7 @@ jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv*, jobject, * Method: disableFileDeletions * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::Status s = db->DisableFileDeletions(); @@ -3605,11 +3137,10 @@ void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, jobject, * Method: enableFileDeletions * Signature: (JZ)V */ -void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, jobject, - jlong jdb_handle, - jboolean jforce) { +void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, jclass, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); - ROCKSDB_NAMESPACE::Status s = db->EnableFileDeletions(jforce); + ROCKSDB_NAMESPACE::Status s = db->EnableFileDeletions(); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } @@ -3620,7 +3151,7 @@ void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, jobject, * Method: getLiveFiles * Signature: (JZ)[Ljava/lang/String; */ -jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles(JNIEnv* env, jobject, +jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles(JNIEnv* env, jclass, jlong jdb_handle, jboolean jflush_memtable) { auto* db = reinterpret_cast(jdb_handle); @@ -3645,7 +3176,7 @@ jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles(JNIEnv* env, jobject, * Method: getSortedWalFiles * Signature: (J)[Lorg/rocksdb/LogFile; */ -jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles(JNIEnv* env, jobject, +jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); std::vector> sorted_wal_files; @@ -3693,7 +3224,7 @@ jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles(JNIEnv* env, jobject, * Method: getUpdatesSince * Signature: (JJ)J */ -jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, jobject, +jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, jclass, jlong jdb_handle, jlong jsequence_number) { auto* db = reinterpret_cast(jdb_handle); @@ -3714,7 +3245,7 @@ jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, jobject, * Method: deleteFile * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_deleteFile(JNIEnv* env, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_deleteFile(JNIEnv* env, jclass, jlong jdb_handle, jstring jname) { auto* db = reinterpret_cast(jdb_handle); jboolean has_exception = JNI_FALSE; @@ -3732,7 +3263,7 @@ void Java_org_rocksdb_RocksDB_deleteFile(JNIEnv* env, jobject, jlong jdb_handle, * Method: getLiveFilesMetaData * Signature: (J)[Lorg/rocksdb/LiveFileMetaData; */ -jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData(JNIEnv* env, jobject, +jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); std::vector live_files_meta_data; @@ -3779,7 +3310,7 @@ jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData(JNIEnv* env, jobject, * Method: getColumnFamilyMetaData * Signature: (JJ)Lorg/rocksdb/ColumnFamilyMetaData; */ -jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData(JNIEnv* env, jobject, +jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -3802,7 +3333,7 @@ jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData(JNIEnv* env, jobject, * Signature: (JJ[Ljava/lang/String;IJ)V */ void Java_org_rocksdb_RocksDB_ingestExternalFile( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jobjectArray jfile_path_list, jint jfile_path_list_len, jlong jingest_external_file_options_handle) { jboolean has_exception = JNI_FALSE; @@ -3831,7 +3362,7 @@ void Java_org_rocksdb_RocksDB_ingestExternalFile( * Method: verifyChecksum * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_verifyChecksum(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_verifyChecksum(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->VerifyChecksum(); @@ -3845,7 +3376,7 @@ void Java_org_rocksdb_RocksDB_verifyChecksum(JNIEnv* env, jobject, * Method: getDefaultColumnFamily * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(JNIEnv*, jobject, +jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(JNIEnv*, jclass, jlong jdb_handle) { auto* db_handle = reinterpret_cast(jdb_handle); auto* cf_handle = db_handle->DefaultColumnFamily(); @@ -3857,7 +3388,7 @@ jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(JNIEnv*, jobject, * Method: getPropertiesOfAllTables * Signature: (JJ)Ljava/util/Map; */ -jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables(JNIEnv* env, jobject, +jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -3930,7 +3461,7 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables(JNIEnv* env, jobject, * Signature: (JJ[J)Ljava/util/Map; */ jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle, jlongArray jrange_slice_handles) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; @@ -3982,7 +3513,7 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange( * Method: suggestCompactRange * Signature: (JJ)[J */ -jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange(JNIEnv* env, jobject, +jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange(JNIEnv* env, jclass, jlong jdb_handle, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); @@ -4032,7 +3563,7 @@ jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange(JNIEnv* env, jobject, * Method: promoteL0 * Signature: (JJI)V */ -void Java_org_rocksdb_RocksDB_promoteL0(JNIEnv*, jobject, jlong jdb_handle, +void Java_org_rocksdb_RocksDB_promoteL0(JNIEnv*, jclass, jlong jdb_handle, jlong jcf_handle, jint jtarget_level) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; @@ -4051,7 +3582,7 @@ void Java_org_rocksdb_RocksDB_promoteL0(JNIEnv*, jobject, jlong jdb_handle, * Signature: (JJJ)V */ void Java_org_rocksdb_RocksDB_startTrace( - JNIEnv* env, jobject, jlong jdb_handle, jlong jmax_trace_file_size, + JNIEnv* env, jclass, jlong jdb_handle, jlong jmax_trace_file_size, jlong jtrace_writer_jnicallback_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::TraceOptions trace_options; @@ -4073,7 +3604,7 @@ void Java_org_rocksdb_RocksDB_startTrace( * Method: endTrace * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_endTrace(JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_endTrace(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->EndTrace(); if (!s.ok()) { @@ -4086,7 +3617,7 @@ void Java_org_rocksdb_RocksDB_endTrace(JNIEnv* env, jobject, jlong jdb_handle) { * Method: tryCatchUpWithPrimary * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_tryCatchUpWithPrimary(JNIEnv* env, jobject, +void Java_org_rocksdb_RocksDB_tryCatchUpWithPrimary(JNIEnv* env, jclass, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->TryCatchUpWithPrimary(); @@ -4155,7 +3686,7 @@ bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index, * Method: deleteFilesInRanges * Signature: (JJLjava/util/List;Z)V */ -void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jobject /*jdb*/, +void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jclass /*jdb*/, jlong jdb_handle, jlong jcf_handle, jobjectArray ranges, diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 63c6b1b9fbb..583d745661c 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -55,7 +55,7 @@ jlong Java_org_rocksdb_AbstractSlice_createNewSliceFromString(JNIEnv* env, * Method: size0 * Signature: (J)I */ -jint Java_org_rocksdb_AbstractSlice_size0(JNIEnv* /*env*/, jobject /*jobj*/, +jint Java_org_rocksdb_AbstractSlice_size0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle) { const auto* slice = reinterpret_cast(handle); return static_cast(slice->size()); @@ -66,8 +66,8 @@ jint Java_org_rocksdb_AbstractSlice_size0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: empty0 * Signature: (J)Z */ -jboolean Java_org_rocksdb_AbstractSlice_empty0(JNIEnv* /*env*/, - jobject /*jobj*/, jlong handle) { +jboolean Java_org_rocksdb_AbstractSlice_empty0(JNIEnv* /*env*/, jclass /*jcls*/, + jlong handle) { const auto* slice = reinterpret_cast(handle); return slice->empty(); } @@ -77,7 +77,7 @@ jboolean Java_org_rocksdb_AbstractSlice_empty0(JNIEnv* /*env*/, * Method: toString0 * Signature: (JZ)Ljava/lang/String; */ -jstring Java_org_rocksdb_AbstractSlice_toString0(JNIEnv* env, jobject /*jobj*/, +jstring Java_org_rocksdb_AbstractSlice_toString0(JNIEnv* env, jclass /*jobj*/, jlong handle, jboolean hex) { const auto* slice = reinterpret_cast(handle); const std::string s = slice->ToString(hex); @@ -89,7 +89,7 @@ jstring Java_org_rocksdb_AbstractSlice_toString0(JNIEnv* env, jobject /*jobj*/, * Method: compare0 * Signature: (JJ)I; */ -jint Java_org_rocksdb_AbstractSlice_compare0(JNIEnv* /*env*/, jobject /*jobj*/, +jint Java_org_rocksdb_AbstractSlice_compare0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jlong otherHandle) { const auto* slice = reinterpret_cast(handle); const auto* otherSlice = @@ -103,7 +103,7 @@ jint Java_org_rocksdb_AbstractSlice_compare0(JNIEnv* /*env*/, jobject /*jobj*/, * Signature: (JJ)Z; */ jboolean Java_org_rocksdb_AbstractSlice_startsWith0(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle, jlong otherHandle) { const auto* slice = reinterpret_cast(handle); @@ -117,9 +117,9 @@ jboolean Java_org_rocksdb_AbstractSlice_startsWith0(JNIEnv* /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_AbstractSlice_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_AbstractSlice_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { delete reinterpret_cast(handle); } @@ -209,7 +209,7 @@ jbyteArray Java_org_rocksdb_Slice_data0(JNIEnv* env, jobject /*jobj*/, * Method: clear0 * Signature: (JZJ)V */ -void Java_org_rocksdb_Slice_clear0(JNIEnv* /*env*/, jobject /*jobj*/, +void Java_org_rocksdb_Slice_clear0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jboolean shouldRelease, jlong internalBufferOffset) { auto* slice = reinterpret_cast(handle); @@ -225,7 +225,7 @@ void Java_org_rocksdb_Slice_clear0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: removePrefix0 * Signature: (JI)V */ -void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jobject /*jobj*/, +void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jint length) { auto* slice = reinterpret_cast(handle); slice->remove_prefix(length); @@ -236,7 +236,7 @@ void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: setLength0 * Signature: (JI)V */ -void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/, +void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jint length) { auto* slice = reinterpret_cast(handle); slice->size_ = length; @@ -247,8 +247,8 @@ void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: disposeInternalBuf * Signature: (JJ)V */ -void Java_org_rocksdb_Slice_disposeInternalBuf(JNIEnv* /*env*/, - jobject /*jobj*/, jlong handle, +void Java_org_rocksdb_Slice_disposeInternalBuf(JNIEnv* /*env*/, jclass /*jcls*/, + jlong handle, jlong internalBufferOffset) { const auto* slice = reinterpret_cast(handle); const char* buf = slice->data_ - internalBufferOffset; @@ -324,7 +324,7 @@ jobject Java_org_rocksdb_DirectSlice_data0(JNIEnv* env, jobject /*jobj*/, * Method: get0 * Signature: (JI)B */ -jbyte Java_org_rocksdb_DirectSlice_get0(JNIEnv* /*env*/, jobject /*jobj*/, +jbyte Java_org_rocksdb_DirectSlice_get0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jint offset) { const auto* slice = reinterpret_cast(handle); return (*slice)[offset]; @@ -335,7 +335,7 @@ jbyte Java_org_rocksdb_DirectSlice_get0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: clear0 * Signature: (JZJ)V */ -void Java_org_rocksdb_DirectSlice_clear0(JNIEnv* /*env*/, jobject /*jobj*/, +void Java_org_rocksdb_DirectSlice_clear0(JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jboolean shouldRelease, jlong internalBufferOffset) { auto* slice = reinterpret_cast(handle); @@ -352,7 +352,7 @@ void Java_org_rocksdb_DirectSlice_clear0(JNIEnv* /*env*/, jobject /*jobj*/, * Signature: (JI)V */ void Java_org_rocksdb_DirectSlice_removePrefix0(JNIEnv* /*env*/, - jobject /*jobj*/, jlong handle, + jclass /*jcls*/, jlong handle, jint length) { auto* slice = reinterpret_cast(handle); slice->remove_prefix(length); @@ -364,7 +364,7 @@ void Java_org_rocksdb_DirectSlice_removePrefix0(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_DirectSlice_disposeInternalBuf( - JNIEnv* /*env*/, jobject /*jobj*/, jlong handle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong handle, jlong internalBufferOffset) { const auto* slice = reinterpret_cast(handle); const char* buf = slice->data_ - internalBufferOffset; diff --git a/java/rocksjni/snapshot.cc b/java/rocksjni/snapshot.cc index 2a1265a58a0..b677abe274b 100644 --- a/java/rocksjni/snapshot.cc +++ b/java/rocksjni/snapshot.cc @@ -19,7 +19,7 @@ * Signature: (J)J */ jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jsnapshot_handle) { auto* snapshot = reinterpret_cast(jsnapshot_handle); diff --git a/java/rocksjni/sst_file_manager.cc b/java/rocksjni/sst_file_manager.cc index c514368191f..4278c71e5ec 100644 --- a/java/rocksjni/sst_file_manager.cc +++ b/java/rocksjni/sst_file_manager.cc @@ -61,8 +61,7 @@ jlong Java_org_rocksdb_SstFileManager_newSstFileManager( * Signature: (JJ)V */ void Java_org_rocksdb_SstFileManager_setMaxAllowedSpaceUsage( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, - jlong jmax_allowed_space) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jmax_allowed_space) { auto* sptr_sst_file_manager = reinterpret_cast*>( jhandle); @@ -75,7 +74,7 @@ void Java_org_rocksdb_SstFileManager_setMaxAllowedSpaceUsage( * Signature: (JJ)V */ void Java_org_rocksdb_SstFileManager_setCompactionBufferSize( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jcompaction_buffer_size) { auto* sptr_sst_file_manager = reinterpret_cast*>( @@ -90,7 +89,7 @@ void Java_org_rocksdb_SstFileManager_setCompactionBufferSize( * Signature: (J)Z */ jboolean Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReached( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( jhandle); @@ -104,7 +103,7 @@ jboolean Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReached( */ jboolean Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReachedIncludingCompactions( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( jhandle); @@ -118,7 +117,7 @@ Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReachedIncludingCompactions( * Signature: (J)J */ jlong Java_org_rocksdb_SstFileManager_getTotalSize(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( @@ -132,7 +131,7 @@ jlong Java_org_rocksdb_SstFileManager_getTotalSize(JNIEnv* /*env*/, * Signature: (J)Ljava/util/Map; */ jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( @@ -186,7 +185,7 @@ jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env, * Signature: (J)J */ jlong Java_org_rocksdb_SstFileManager_getDeleteRateBytesPerSecond( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( jhandle); @@ -199,7 +198,7 @@ jlong Java_org_rocksdb_SstFileManager_getDeleteRateBytesPerSecond( * Signature: (JJ)V */ void Java_org_rocksdb_SstFileManager_setDeleteRateBytesPerSecond( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jdelete_rate) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jdelete_rate) { auto* sptr_sst_file_manager = reinterpret_cast*>( jhandle); @@ -212,7 +211,7 @@ void Java_org_rocksdb_SstFileManager_setDeleteRateBytesPerSecond( * Signature: (J)D */ jdouble Java_org_rocksdb_SstFileManager_getMaxTrashDBRatio(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( @@ -226,7 +225,7 @@ jdouble Java_org_rocksdb_SstFileManager_getMaxTrashDBRatio(JNIEnv* /*env*/, * Signature: (JD)V */ void Java_org_rocksdb_SstFileManager_setMaxTrashDBRatio(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jdouble jratio) { auto* sptr_sst_file_manager = @@ -240,9 +239,9 @@ void Java_org_rocksdb_SstFileManager_setMaxTrashDBRatio(JNIEnv* /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileManager_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_SstFileManager_disposeInternalJni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong jhandle) { auto* sptr_sst_file_manager = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/sst_file_reader_iterator.cc b/java/rocksjni/sst_file_reader_iterator.cc index 68fa4c37c8b..ab86180d80c 100644 --- a/java/rocksjni/sst_file_reader_iterator.cc +++ b/java/rocksjni/sst_file_reader_iterator.cc @@ -19,9 +19,9 @@ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong handle) { auto* it = reinterpret_cast(handle); assert(it != nullptr); delete it; @@ -32,9 +32,9 @@ void Java_org_rocksdb_SstFileReaderIterator_disposeInternal(JNIEnv* /*env*/, * Method: isValid0 * Signature: (J)Z */ -jboolean Java_org_rocksdb_SstFileReaderIterator_isValid0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +jboolean Java_org_rocksdb_SstFileReaderIterator_isValid0Jni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong handle) { return reinterpret_cast(handle)->Valid(); } @@ -43,9 +43,9 @@ jboolean Java_org_rocksdb_SstFileReaderIterator_isValid0(JNIEnv* /*env*/, * Method: seekToFirst0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekToFirst0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_seekToFirst0Jni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong handle) { reinterpret_cast(handle)->SeekToFirst(); } @@ -54,9 +54,9 @@ void Java_org_rocksdb_SstFileReaderIterator_seekToFirst0(JNIEnv* /*env*/, * Method: seekToLast0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekToLast0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_seekToLast0Jni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong handle) { reinterpret_cast(handle)->SeekToLast(); } @@ -65,9 +65,9 @@ void Java_org_rocksdb_SstFileReaderIterator_seekToLast0(JNIEnv* /*env*/, * Method: next0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_next0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_next0Jni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong handle) { reinterpret_cast(handle)->Next(); } @@ -76,9 +76,9 @@ void Java_org_rocksdb_SstFileReaderIterator_next0(JNIEnv* /*env*/, * Method: prev0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_prev0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_prev0Jni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong handle) { reinterpret_cast(handle)->Prev(); } @@ -87,10 +87,11 @@ void Java_org_rocksdb_SstFileReaderIterator_prev0(JNIEnv* /*env*/, * Method: seek0 * Signature: (J[BI)V */ -void Java_org_rocksdb_SstFileReaderIterator_seek0(JNIEnv* env, jobject /*jobj*/, - jlong handle, - jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_SstFileReaderIterator_seek0Jni(JNIEnv* env, + jclass /*cls*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { jbyte* target = env->GetByteArrayElements(jtarget, nullptr); if (target == nullptr) { // exception thrown: OutOfMemoryError @@ -111,11 +112,11 @@ void Java_org_rocksdb_SstFileReaderIterator_seek0(JNIEnv* env, jobject /*jobj*/, * Method: seekForPrev0 * Signature: (J[BI)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekForPrev0(JNIEnv* env, - jobject /*jobj*/, - jlong handle, - jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_SstFileReaderIterator_seekForPrev0Jni(JNIEnv* env, + jclass /*cls*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { jbyte* target = env->GetByteArrayElements(jtarget, nullptr); if (target == nullptr) { // exception thrown: OutOfMemoryError @@ -136,9 +137,9 @@ void Java_org_rocksdb_SstFileReaderIterator_seekForPrev0(JNIEnv* env, * Method: status0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_status0(JNIEnv* env, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_status0Jni(JNIEnv* env, + jclass /*cls*/, + jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Status s = it->status(); @@ -155,7 +156,7 @@ void Java_org_rocksdb_SstFileReaderIterator_status0(JNIEnv* env, * Signature: (J)[B */ jbyteArray Java_org_rocksdb_SstFileReaderIterator_key0(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice key_slice = it->key(); @@ -177,7 +178,7 @@ jbyteArray Java_org_rocksdb_SstFileReaderIterator_key0(JNIEnv* env, * Signature: (J)[B */ jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice value_slice = it->value(); @@ -200,7 +201,7 @@ jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, * Signature: (JLjava/nio/ByteBuffer;II)I */ jint Java_org_rocksdb_SstFileReaderIterator_keyDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, + JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice key_slice = it->key(); @@ -217,7 +218,7 @@ jint Java_org_rocksdb_SstFileReaderIterator_keyDirect0( * Signature: (J[BII)I */ jint Java_org_rocksdb_SstFileReaderIterator_keyByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jkey, jint jkey_off, + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jkey, jint jkey_off, jint jkey_len) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice key_slice = it->key(); @@ -237,7 +238,7 @@ jint Java_org_rocksdb_SstFileReaderIterator_keyByteArray0( * Signature: (JLjava/nio/ByteBuffer;II)I */ jint Java_org_rocksdb_SstFileReaderIterator_valueDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, + JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice value_slice = it->value(); @@ -254,7 +255,7 @@ jint Java_org_rocksdb_SstFileReaderIterator_valueDirect0( * Signature: (J[BII)I */ jint Java_org_rocksdb_SstFileReaderIterator_valueByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jvalue_target, + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jvalue_target, jint jvalue_off, jint jvalue_len) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice value_slice = it->value(); @@ -273,8 +274,8 @@ jint Java_org_rocksdb_SstFileReaderIterator_valueByteArray0( * Method: seekDirect0 * Signature: (JLjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, +void Java_org_rocksdb_SstFileReaderIterator_seekDirect0Jni( + JNIEnv* env, jclass /*cls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -289,8 +290,8 @@ void Java_org_rocksdb_SstFileReaderIterator_seekDirect0( * Method: seekForPrevDirect0 * Signature: (JLjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekForPrevDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, +void Java_org_rocksdb_SstFileReaderIterator_seekForPrevDirect0Jni( + JNIEnv* env, jclass /*cls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seekPrev = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -308,8 +309,8 @@ void Java_org_rocksdb_SstFileReaderIterator_seekForPrevDirect0( * Method: seekByteArray0 * Signature: (J[BII)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget, +void Java_org_rocksdb_SstFileReaderIterator_seekByteArray0Jni( + JNIEnv* env, jclass /*cls*/, jlong handle, jbyteArray jtarget, jint jtarget_off, jint jtarget_len) { const std::unique_ptr target(new char[jtarget_len]); if (target == nullptr) { @@ -335,8 +336,8 @@ void Java_org_rocksdb_SstFileReaderIterator_seekByteArray0( * Method: seekForPrevByteArray0 * Signature: (J[BII)V */ -void Java_org_rocksdb_SstFileReaderIterator_seekForPrevByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget, +void Java_org_rocksdb_SstFileReaderIterator_seekForPrevByteArray0Jni( + JNIEnv* env, jclass /*cls*/, jlong handle, jbyteArray jtarget, jint jtarget_off, jint jtarget_len) { const std::unique_ptr target(new char[jtarget_len]); if (target == nullptr) { @@ -359,9 +360,9 @@ void Java_org_rocksdb_SstFileReaderIterator_seekForPrevByteArray0( * Method: refresh0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_refresh0Jni(JNIEnv* env, + jclass /*cls*/, + jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Status s = it->Refresh(); @@ -371,3 +372,24 @@ void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env, ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: refresh1 + * Signature: (JJ)V + */ +void Java_org_rocksdb_SstFileReaderIterator_refresh1(JNIEnv* env, + jobject /*jobj*/, + jlong handle, + jlong snapshot_handle) { + auto* it = reinterpret_cast(handle); + auto* snapshot = + reinterpret_cast(snapshot_handle); + ROCKSDB_NAMESPACE::Status s = it->Refresh(snapshot); + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} diff --git a/java/rocksjni/sst_file_readerjni.cc b/java/rocksjni/sst_file_readerjni.cc index 7ef711842ac..4af472ecfb1 100644 --- a/java/rocksjni/sst_file_readerjni.cc +++ b/java/rocksjni/sst_file_readerjni.cc @@ -39,7 +39,7 @@ jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/, * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jstring jfile_path) { const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); if (file_path == nullptr) { @@ -62,8 +62,7 @@ void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jobject /*jobj*/, * Signature: (JJ)J */ jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/, - jobject /*jobj*/, - jlong jhandle, + jclass /*jcls*/, jlong jhandle, jlong jread_options_handle) { auto *sst_file_reader = reinterpret_cast(jhandle); @@ -77,9 +76,9 @@ jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileReader_disposeInternal(JNIEnv * /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/, + jclass /*jcls*/, + jlong jhandle) { delete reinterpret_cast(jhandle); } @@ -88,8 +87,7 @@ void Java_org_rocksdb_SstFileReader_disposeInternal(JNIEnv * /*env*/, * Method: verifyChecksum * Signature: (J)V */ -void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, - jobject /*jobj*/, +void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/, jlong jhandle) { auto *sst_file_reader = reinterpret_cast(jhandle); @@ -105,7 +103,7 @@ void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, * Signature: (J)J */ jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto *sst_file_reader = reinterpret_cast(jhandle); diff --git a/java/rocksjni/sst_file_writerjni.cc b/java/rocksjni/sst_file_writerjni.cc index 1898c3cfc93..481adbc8564 100644 --- a/java/rocksjni/sst_file_writerjni.cc +++ b/java/rocksjni/sst_file_writerjni.cc @@ -73,7 +73,7 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/, * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jstring jfile_path) { const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); if (file_path == nullptr) { @@ -95,7 +95,7 @@ void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject /*jobj*/, * Method: put * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jlong jkey_handle, jlong jvalue_handle) { auto *key_slice = reinterpret_cast(jkey_handle); @@ -114,7 +114,7 @@ void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject /*jobj*/, * Method: put * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jbyteArray jval) { jbyte *key = env->GetByteArrayElements(jkey, nullptr); @@ -151,7 +151,7 @@ void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jobject /*jobj*/, * Method: putDirect * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jobject /*jdb*/, +void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/, jlong jdb_handle, jobject jkey, jint jkey_off, jint jkey_len, jobject jval, jint jval_off, @@ -175,7 +175,7 @@ void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jobject /*jdb*/, * Method: fileSize * Signature: (J)J */ -jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jobject /*jdb*/, +jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/, jlong jdb_handle) { auto *writer = reinterpret_cast(jdb_handle); @@ -187,7 +187,7 @@ jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jobject /*jdb*/, * Method: merge * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jlong jkey_handle, jlong jvalue_handle) { auto *key_slice = reinterpret_cast(jkey_handle); @@ -206,8 +206,7 @@ void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject /*jobj*/, * Method: merge * Signature: (J[B[B)V */ -void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, - jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jbyteArray jval) { @@ -245,7 +244,7 @@ void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, * Method: delete * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jbyteArray jkey) { jbyte *key = env->GetByteArrayElements(jkey, nullptr); @@ -272,7 +271,7 @@ void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject /*jobj*/, * Method: delete * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/, jlong jhandle, jlong jkey_handle) { auto *key_slice = reinterpret_cast(jkey_handle); @@ -289,7 +288,7 @@ void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jobject /*jobj*/, * Method: finish * Signature: (J)V */ -void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jobject /*jobj*/, +void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/, jlong jhandle) { ROCKSDB_NAMESPACE::Status s = reinterpret_cast(jhandle)->Finish(); @@ -303,8 +302,8 @@ void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jobject /*jobj*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileWriter_disposeInternal(JNIEnv * /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv * /*env*/, + jclass /*jobj*/, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/sst_partitioner.cc b/java/rocksjni/sst_partitioner.cc index 1cea3b0cb68..2ee94dd1ee5 100644 --- a/java/rocksjni/sst_partitioner.cc +++ b/java/rocksjni/sst_partitioner.cc @@ -35,8 +35,8 @@ jlong Java_org_rocksdb_SstPartitionerFixedPrefixFactory_newSstPartitionerFixedPr * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstPartitionerFixedPrefixFactory_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_SstPartitionerFixedPrefixFactory_disposeInternalJni( + JNIEnv*, jclass, jlong jhandle) { auto* ptr = reinterpret_cast< std::shared_ptr*>(jhandle); delete ptr; // delete std::shared_ptr diff --git a/java/rocksjni/statistics.cc b/java/rocksjni/statistics.cc index bd405afa119..d595086cff1 100644 --- a/java/rocksjni/statistics.cc +++ b/java/rocksjni/statistics.cc @@ -104,8 +104,8 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_Statistics_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_Statistics_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { if (jhandle > 0) { auto* pSptr_statistics = reinterpret_cast*>( @@ -119,7 +119,7 @@ void Java_org_rocksdb_Statistics_disposeInternal(JNIEnv*, jobject, * Method: statsLevel * Signature: (J)B */ -jbyte Java_org_rocksdb_Statistics_statsLevel(JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Statistics_statsLevel(JNIEnv*, jclass, jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -133,7 +133,7 @@ jbyte Java_org_rocksdb_Statistics_statsLevel(JNIEnv*, jobject, jlong jhandle) { * Method: setStatsLevel * Signature: (JB)V */ -void Java_org_rocksdb_Statistics_setStatsLevel(JNIEnv*, jobject, jlong jhandle, +void Java_org_rocksdb_Statistics_setStatsLevel(JNIEnv*, jclass, jlong jhandle, jbyte jstats_level) { auto* pSptr_statistics = reinterpret_cast*>( @@ -149,8 +149,7 @@ void Java_org_rocksdb_Statistics_setStatsLevel(JNIEnv*, jobject, jlong jhandle, * Method: getTickerCount * Signature: (JB)J */ -jlong Java_org_rocksdb_Statistics_getTickerCount(JNIEnv*, jobject, - jlong jhandle, +jlong Java_org_rocksdb_Statistics_getTickerCount(JNIEnv*, jclass, jlong jhandle, jbyte jticker_type) { auto* pSptr_statistics = reinterpret_cast*>( @@ -166,7 +165,7 @@ jlong Java_org_rocksdb_Statistics_getTickerCount(JNIEnv*, jobject, * Method: getAndResetTickerCount * Signature: (JB)J */ -jlong Java_org_rocksdb_Statistics_getAndResetTickerCount(JNIEnv*, jobject, +jlong Java_org_rocksdb_Statistics_getAndResetTickerCount(JNIEnv*, jclass, jlong jhandle, jbyte jticker_type) { auto* pSptr_statistics = @@ -182,7 +181,7 @@ jlong Java_org_rocksdb_Statistics_getAndResetTickerCount(JNIEnv*, jobject, * Method: getHistogramData * Signature: (JB)Lorg/rocksdb/HistogramData; */ -jobject Java_org_rocksdb_Statistics_getHistogramData(JNIEnv* env, jobject, +jobject Java_org_rocksdb_Statistics_getHistogramData(JNIEnv* env, jclass, jlong jhandle, jbyte jhistogram_type) { auto* pSptr_statistics = @@ -223,7 +222,7 @@ jobject Java_org_rocksdb_Statistics_getHistogramData(JNIEnv* env, jobject, * Method: getHistogramString * Signature: (JB)Ljava/lang/String; */ -jstring Java_org_rocksdb_Statistics_getHistogramString(JNIEnv* env, jobject, +jstring Java_org_rocksdb_Statistics_getHistogramString(JNIEnv* env, jclass, jlong jhandle, jbyte jhistogram_type) { auto* pSptr_statistics = @@ -241,7 +240,7 @@ jstring Java_org_rocksdb_Statistics_getHistogramString(JNIEnv* env, jobject, * Method: reset * Signature: (J)V */ -void Java_org_rocksdb_Statistics_reset(JNIEnv* env, jobject, jlong jhandle) { +void Java_org_rocksdb_Statistics_reset(JNIEnv* env, jclass, jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -257,7 +256,7 @@ void Java_org_rocksdb_Statistics_reset(JNIEnv* env, jobject, jlong jhandle) { * Method: toString * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_Statistics_toString(JNIEnv* env, jobject, +jstring Java_org_rocksdb_Statistics_toString(JNIEnv* env, jclass, jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>( diff --git a/java/rocksjni/statisticsjni.h b/java/rocksjni/statisticsjni.h index 3262b296cf5..d22849fc3d7 100644 --- a/java/rocksjni/statisticsjni.h +++ b/java/rocksjni/statisticsjni.h @@ -23,7 +23,7 @@ class StatisticsJni : public StatisticsImpl { StatisticsJni(std::shared_ptr stats); StatisticsJni(std::shared_ptr stats, const std::set ignore_histograms); - virtual bool HistEnabledForType(uint32_t type) const override; + bool HistEnabledForType(uint32_t type) const override; private: const std::set m_ignore_histograms; diff --git a/java/rocksjni/stderr_logger.cc b/java/rocksjni/stderr_logger.cc new file mode 100644 index 00000000000..22da70c2e86 --- /dev/null +++ b/java/rocksjni/stderr_logger.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/stderr_logger.h" + +#include + +#include + +#include "include/org_rocksdb_util_StdErrLogger.h" +#include "rocksjni/cplusplus_to_java_convert.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_util_StdErrLogger + * Method: newStdErrLogger + * Signature: (BLjava/lang/String;)J + */ +jlong Java_org_rocksdb_util_StdErrLogger_newStdErrLogger(JNIEnv* env, + jclass /*jcls*/, + jbyte jlog_level, + jstring jlog_prefix) { + auto log_level = static_cast(jlog_level); + std::shared_ptr* sptr_logger = nullptr; + if (jlog_prefix == nullptr) { + sptr_logger = new std::shared_ptr( + new ROCKSDB_NAMESPACE::StderrLogger(log_level)); + } else { + jboolean has_exception = JNI_FALSE; + auto log_prefix = ROCKSDB_NAMESPACE::JniUtil::copyStdString( + env, jlog_prefix, &has_exception); // also releases jlog_prefix + if (has_exception == JNI_TRUE) { + return 0; + } + sptr_logger = new std::shared_ptr( + new ROCKSDB_NAMESPACE::StderrLogger(log_level, log_prefix)); + } + return GET_CPLUSPLUS_POINTER(sptr_logger); +} + +/* + * Class: org_rocksdb_util_StdErrLogger + * Method: setInfoLogLevel + * Signature: (JB)V + */ +void Java_org_rocksdb_util_StdErrLogger_setInfoLogLevel(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle, + jbyte jlog_level) { + auto* handle = + reinterpret_cast*>( + jhandle); + handle->get()->SetInfoLogLevel( + static_cast(jlog_level)); +} + +/* + * Class: org_rocksdb_util_StdErrLogger + * Method: infoLogLevel + * Signature: (J)B + */ +jbyte Java_org_rocksdb_util_StdErrLogger_infoLogLevel(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { + auto* handle = + reinterpret_cast*>( + jhandle); + return static_cast(handle->get()->GetInfoLogLevel()); +} + +/* + * Class: org_rocksdb_util_StdErrLogger + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_util_StdErrLogger_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* handle = + reinterpret_cast*>( + jhandle); + delete handle; // delete std::shared_ptr +} diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 7f99900e4cb..eb5de1695e6 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -23,9 +23,9 @@ * Signature: (IIDIIBZZ)J */ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( - JNIEnv * /*env*/, jobject /*jobj*/, jint jkey_size, - jint jbloom_bits_per_key, jdouble jhash_table_ratio, jint jindex_sparseness, - jint jhuge_page_tlb_size, jbyte jencoding_type, jboolean jfull_scan_mode, + JNIEnv * /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key, + jdouble jhash_table_ratio, jint jindex_sparseness, jint jhuge_page_tlb_size, + jbyte jencoding_type, jboolean jfull_scan_mode, jboolean jstore_index_in_file) { ROCKSDB_NAMESPACE::PlainTableOptions options = ROCKSDB_NAMESPACE::PlainTableOptions(); @@ -48,7 +48,7 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv *, jobject, jboolean jcache_index_and_filter_blocks, + JNIEnv *, jclass, jboolean jcache_index_and_filter_blocks, jboolean jcache_index_and_filter_blocks_with_high_priority, jboolean jpin_l0_filter_and_index_blocks_in_cache, jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value, diff --git a/java/rocksjni/table_properties_collector_factory.cc b/java/rocksjni/table_properties_collector_factory.cc new file mode 100644 index 00000000000..60e1df6e8b1 --- /dev/null +++ b/java/rocksjni/table_properties_collector_factory.cc @@ -0,0 +1,39 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "java/rocksjni/table_properties_collector_factory.h" + +#include "java/include/org_rocksdb_TablePropertiesCollectorFactory.h" +#include "java/rocksjni/cplusplus_to_java_convert.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/table_properties_collectors.h" + +/* + * Class: org_rocksdb_TablePropertiesCollectorFactory + * Method: newCompactOnDeletionCollectorFactory + * Signature: (JJD)J + */ +jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionCollectorFactory( + JNIEnv *, jclass, jlong sliding_window_size, jlong deletion_trigger, + jdouble deletion_ratio) { + auto *wrapper = new TablePropertiesCollectorFactoriesJniWrapper(); + wrapper->table_properties_collector_factories = + ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory( + sliding_window_size, deletion_trigger, deletion_ratio); + return GET_CPLUSPLUS_POINTER(wrapper); +} + +/* + * Class: org_rocksdb_TablePropertiesCollectorFactory + * Method: deleteCompactOnDeletionCollectorFactory + * Signature: (J)J + */ +void Java_org_rocksdb_TablePropertiesCollectorFactory_deleteCompactOnDeletionCollectorFactory( + JNIEnv *, jclass, jlong jhandle) { + auto instance = + reinterpret_cast(jhandle); + delete instance; +} diff --git a/java/rocksjni/table_properties_collector_factory.h b/java/rocksjni/table_properties_collector_factory.h new file mode 100644 index 00000000000..411c218c30c --- /dev/null +++ b/java/rocksjni/table_properties_collector_factory.h @@ -0,0 +1,17 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/table_properties_collectors.h" + +#ifndef ROCKSDB_TABLE_PROPERTIES_COLLECTOR_FACTORY_H +#define ROCKSDB_TABLE_PROPERTIES_COLLECTOR_FACTORY_H + +struct TablePropertiesCollectorFactoriesJniWrapper { + std::shared_ptr + table_properties_collector_factories; +}; +#endif // ROCKSDB_TABLE_PROPERTIES_COLLECTOR_FACTORY_H diff --git a/java/rocksjni/testable_event_listener.cc b/java/rocksjni/testable_event_listener.cc index 71188bc3c6c..483ade16056 100644 --- a/java/rocksjni/testable_event_listener.cc +++ b/java/rocksjni/testable_event_listener.cc @@ -201,7 +201,7 @@ void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( FileOperationInfo(FileOperationType::kRead, file_path, start_timestamp, finish_timestamp, status); op_info.offset = UINT64_MAX; - op_info.length = SIZE_MAX; + op_info.length = 4096; el->OnFileReadFinish(op_info); el->OnFileWriteFinish(op_info); diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc index 1a0a64fc7f0..e211ebe5d6d 100644 --- a/java/rocksjni/transaction.cc +++ b/java/rocksjni/transaction.cc @@ -14,6 +14,8 @@ #include "include/org_rocksdb_Transaction.h" #include "rocksjni/cplusplus_to_java_convert.h" +#include "rocksjni/jni_multiget_helpers.h" +#include "rocksjni/kv_helper.h" #include "rocksjni/portal.h" #if defined(_MSC_VER) @@ -27,7 +29,7 @@ * Method: setSnapshot * Signature: (J)V */ -void Java_org_rocksdb_Transaction_setSnapshot(JNIEnv* /*env*/, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_setSnapshot(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); txn->SetSnapshot(); @@ -38,8 +40,9 @@ void Java_org_rocksdb_Transaction_setSnapshot(JNIEnv* /*env*/, jobject /*jobj*/, * Method: setSnapshotOnNextOperation * Signature: (J)V */ -void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__J( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__J(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* txn = reinterpret_cast(jhandle); txn->SetSnapshotOnNextOperation(nullptr); } @@ -50,7 +53,7 @@ void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__J( * Signature: (JJ)V */ void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__JJ( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jtxn_notifier_handle) { auto* txn = reinterpret_cast(jhandle); auto* txn_notifier = reinterpret_cast< @@ -64,8 +67,7 @@ void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__JJ( * Method: getSnapshot * Signature: (J)J */ -jlong Java_org_rocksdb_Transaction_getSnapshot(JNIEnv* /*env*/, - jobject /*jobj*/, +jlong Java_org_rocksdb_Transaction_getSnapshot(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); const ROCKSDB_NAMESPACE::Snapshot* snapshot = txn->GetSnapshot(); @@ -78,7 +80,7 @@ jlong Java_org_rocksdb_Transaction_getSnapshot(JNIEnv* /*env*/, * Signature: (J)V */ void Java_org_rocksdb_Transaction_clearSnapshot(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); txn->ClearSnapshot(); @@ -89,7 +91,7 @@ void Java_org_rocksdb_Transaction_clearSnapshot(JNIEnv* /*env*/, * Method: prepare * Signature: (J)V */ -void Java_org_rocksdb_Transaction_prepare(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_prepare(JNIEnv* env, jclass /*jobj*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::Status s = txn->Prepare(); @@ -103,7 +105,7 @@ void Java_org_rocksdb_Transaction_prepare(JNIEnv* env, jobject /*jobj*/, * Method: commit * Signature: (J)V */ -void Java_org_rocksdb_Transaction_commit(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_commit(JNIEnv* env, jclass /*jobj*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::Status s = txn->Commit(); @@ -117,7 +119,7 @@ void Java_org_rocksdb_Transaction_commit(JNIEnv* env, jobject /*jobj*/, * Method: rollback * Signature: (J)V */ -void Java_org_rocksdb_Transaction_rollback(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_rollback(JNIEnv* env, jclass /*jobj*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::Status s = txn->Rollback(); @@ -131,8 +133,7 @@ void Java_org_rocksdb_Transaction_rollback(JNIEnv* env, jobject /*jobj*/, * Method: setSavePoint * Signature: (J)V */ -void Java_org_rocksdb_Transaction_setSavePoint(JNIEnv* /*env*/, - jobject /*jobj*/, +void Java_org_rocksdb_Transaction_setSavePoint(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); txn->SetSavePoint(); @@ -144,7 +145,7 @@ void Java_org_rocksdb_Transaction_setSavePoint(JNIEnv* /*env*/, * Signature: (J)V */ void Java_org_rocksdb_Transaction_rollbackToSavePoint(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::Status s = txn->RollbackToSavePoint(); @@ -158,115 +159,113 @@ typedef std::function FnGet; -// TODO(AR) consider refactoring to share this between here and rocksjni.cc -jbyteArray txn_get_helper(JNIEnv* env, const FnGet& fn_get, - const jlong& jread_options_handle, - const jbyteArray& jkey, const jint& jkey_part_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if (key == nullptr) { - // exception thrown: OutOfMemoryError +/* + * Class: org_rocksdb_Transaction + * Method: get + * Signature: (JJ[BIIJ)[B + */ +jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BIIJ( + JNIEnv* env, jclass, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_part_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* read_options = + reinterpret_cast(jread_options_handle); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Get(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice())); + return value.NewByteArray(); + } catch (ROCKSDB_NAMESPACE::KVException&) { return nullptr; } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), - jkey_part_len); +} +/* + * Class: org_rocksdb_Transaction + * Method: get + * Signature: (JJ[BII)[B + */ +jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BII( + JNIEnv* env, jclass, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); auto* read_options = reinterpret_cast(jread_options_handle); - std::string value; - ROCKSDB_NAMESPACE::Status s = fn_get(*read_options, key_slice, &value); - - // trigger java unref on key. - // by passing JNI_ABORT, it will simply release the reference without - // copying the result back to the java byte array. - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - - if (s.IsNotFound()) { + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Get(*read_options, key.slice(), &value.pinnable_slice())); + return value.NewByteArray(); + } catch (ROCKSDB_NAMESPACE::KVException&) { return nullptr; } - - if (s.ok()) { - jbyteArray jret_value = env->NewByteArray(static_cast(value.size())); - if (jret_value == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - env->SetByteArrayRegion( - jret_value, 0, static_cast(value.size()), - const_cast(reinterpret_cast(value.c_str()))); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - return nullptr; - } - return jret_value; - } - - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; } /* * Class: org_rocksdb_Transaction * Method: get - * Signature: (JJ[BIJ)[B + * Signature: (JJ[BII[BIIJ)I */ -jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, - jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle) { +jint Java_org_rocksdb_Transaction_get__JJ_3BII_3BIIJ( + JNIEnv* env, jclass, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_part_len, jbyteArray jval, + jint jval_off, jint jval_part_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); + auto* read_options = + reinterpret_cast(jread_options_handle); auto* column_family_handle = reinterpret_cast( jcolumn_family_handle); - FnGet fn_get = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1, - column_family_handle, std::placeholders::_2, std::placeholders::_3); - return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, + jval_part_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Get(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice())); + return value.Fetch(); + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); + } } /* * Class: org_rocksdb_Transaction - * Method: get - * Signature: (JJ[BI)[B + * Method: getDirect + * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)I */ -jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, - jbyteArray jkey, jint jkey_part_len) { +jint Java_org_rocksdb_Transaction_getDirect(JNIEnv* env, jclass, jlong jhandle, + jlong jread_options_handle, + jobject jkey_bb, jint jkey_off, + jint jkey_part_len, jobject jval_bb, + jint jval_off, jint jval_part_len, + jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); - FnGet fn_get = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3); - return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); -} + auto* read_options = + reinterpret_cast(jread_options_handle); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); -// TODO(AR) consider refactoring to share this between here and rocksjni.cc -// used by txn_multi_get_helper below -std::vector txn_column_families_helper( - JNIEnv* env, jlongArray jcolumn_family_handles, bool* has_exception) { - std::vector cf_handles; - if (jcolumn_family_handles != nullptr) { - const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); - if (len_cols > 0) { - jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); - if (jcfh == nullptr) { - // exception thrown: OutOfMemoryError - *has_exception = JNI_TRUE; - return std::vector(); - } - for (int i = 0; i < len_cols; i++) { - auto* cf_handle = - reinterpret_cast(jcfh[i]); - cf_handles.push_back(cf_handle); - } - env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); - } + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, + jkey_part_len); + ROCKSDB_NAMESPACE::JDirectBufferPinnableSlice value(env, jval_bb, jval_off, + jval_part_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Get(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice())); + return value.Fetch(); + } catch (const ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); } - return cf_handles; } typedef std::function( @@ -293,117 +292,30 @@ void free_key_values(std::vector& keys_to_free) { } } -// TODO(AR) consider refactoring to share this between here and rocksjni.cc -// cf multi get -jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet& fn_multi_get, - const jlong& jread_options_handle, - const jobjectArray& jkey_parts) { - const jsize len_key_parts = env->GetArrayLength(jkey_parts); - - std::vector key_parts; - std::vector keys_to_free; - for (int i = 0; i < len_key_parts; i++) { - const jobject jk = env->GetObjectArrayElement(jkey_parts, i); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - free_key_values(keys_to_free); - return nullptr; - } - jbyteArray jk_ba = reinterpret_cast(jk); - const jsize len_key = env->GetArrayLength(jk_ba); - jbyte* jk_val = new jbyte[len_key]; - if (jk_val == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(jk); - free_key_values(keys_to_free); - - jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); - (env)->ThrowNew(exception_cls, - "Insufficient Memory for CF handle array."); - return nullptr; - } - env->GetByteArrayRegion(jk_ba, 0, len_key, jk_val); - - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(jk_val), - len_key); - key_parts.push_back(key_slice); - keys_to_free.push_back(jk_val); - env->DeleteLocalRef(jk); - } - - auto* read_options = - reinterpret_cast(jread_options_handle); - std::vector value_parts; - std::vector s = - fn_multi_get(*read_options, key_parts, &value_parts); - - // free up allocated byte arrays - free_key_values(keys_to_free); - - // prepare the results - const jclass jcls_ba = env->FindClass("[B"); - jobjectArray jresults = - env->NewObjectArray(static_cast(s.size()), jcls_ba, nullptr); - if (jresults == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - - // add to the jresults - for (std::vector::size_type i = 0; i != s.size(); - i++) { - if (s[i].ok()) { - jbyteArray jentry_value = - env->NewByteArray(static_cast(value_parts[i].size())); - if (jentry_value == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - - env->SetByteArrayRegion( - jentry_value, 0, static_cast(value_parts[i].size()), - const_cast( - reinterpret_cast(value_parts[i].c_str()))); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jentry_value); - return nullptr; - } - - env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); - env->DeleteLocalRef(jentry_value); - } - } - - return jresults; -} - /* * Class: org_rocksdb_Transaction * Method: multiGet * Signature: (JJ[[B[J)[[B */ jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B_3J( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle, jobjectArray jkey_parts, jlongArray jcolumn_family_handles) { - bool has_exception = false; - const std::vector - column_family_handles = txn_column_families_helper( - env, jcolumn_family_handles, &has_exception); - if (has_exception) { - // exception thrown: OutOfMemoryError + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkey_parts)) { return nullptr; } + auto cf_handles = + ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handlesFromJLongArray( + env, jcolumn_family_handles); + if (!cf_handles) return nullptr; auto* txn = reinterpret_cast(jhandle); - FnMultiGet fn_multi_get = std::bind ( - ROCKSDB_NAMESPACE::Transaction::*)( - const ROCKSDB_NAMESPACE::ReadOptions&, - const std::vector&, - const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1, - column_family_handles, std::placeholders::_2, std::placeholders::_3); - return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, - jkey_parts); + std::vector values(keys.size()); + std::vector statuses = txn->MultiGet( + *reinterpret_cast(jread_options_handle), + *cf_handles, keys.slices(), &values); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* @@ -412,63 +324,141 @@ jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B_3J( * Signature: (JJ[[B)[[B */ jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle, jobjectArray jkey_parts) { + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkey_parts)) { + return nullptr; + } + auto* txn = reinterpret_cast(jhandle); - FnMultiGet fn_multi_get = std::bind ( - ROCKSDB_NAMESPACE::Transaction::*)( - const ROCKSDB_NAMESPACE::ReadOptions&, - const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1, - std::placeholders::_2, std::placeholders::_3); - return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, - jkey_parts); + std::vector values(keys.size()); + std::vector statuses = txn->MultiGet( + *reinterpret_cast(jread_options_handle), + keys.slices(), &values); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* * Class: org_rocksdb_Transaction * Method: getForUpdate - * Signature: (JJ[BIJZZ)[B + * Signature: (JJ[BIIJZZ)[B */ -jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIJZZ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, - jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle, - jboolean jexclusive, jboolean jdo_validate) { +jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIIJZZ( + JNIEnv* env, jclass, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_part_len, + jlong jcolumn_family_handle, jboolean jexclusive, jboolean jdo_validate) { + auto* read_options = + reinterpret_cast(jread_options_handle); auto* column_family_handle = reinterpret_cast( jcolumn_family_handle); auto* txn = reinterpret_cast(jhandle); - FnGet fn_get_for_update = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, - std::placeholders::_1, column_family_handle, std::placeholders::_2, - std::placeholders::_3, jexclusive, jdo_validate); - return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, - jkey_part_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + txn->GetForUpdate(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice(), jexclusive, jdo_validate)); + return value.NewByteArray(); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return nullptr; + } } /* * Class: org_rocksdb_Transaction * Method: getForUpdate - * Signature: (JJ[BIZZ)[B + * Signature: (JJ[BII[BIIJZZ)I + */ +jint Java_org_rocksdb_Transaction_getForUpdate__JJ_3BII_3BIIJZZ( + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_part_len, jbyteArray jval, + jint jval_off, jint jval_len, jlong jcolumn_family_handle, + jboolean jexclusive, jboolean jdo_validate) { + auto* read_options = + reinterpret_cast(jread_options_handle); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + auto* txn = reinterpret_cast(jhandle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + txn->GetForUpdate(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice(), jexclusive, jdo_validate)); + return value.Fetch(); + } catch (ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: getDirect + * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJZZ)I */ -jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIZZ( +jint Java_org_rocksdb_Transaction_getDirect( JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, - jbyteArray jkey, jint jkey_part_len, jboolean jexclusive, - jboolean jdo_validate) { + jobject jkey_bb, jint jkey_off, jint jkey_part_len, jobject jval_bb, + jint jval_off, jint jval_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); - FnGet fn_get_for_update = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, - std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, - jexclusive, jdo_validate); - return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, - jkey_part_len); + auto* read_options = + reinterpret_cast(jread_options_handle); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, + jkey_part_len); + ROCKSDB_NAMESPACE::JDirectBufferPinnableSlice value(env, jval_bb, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Get(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice())); + return value.Fetch(); + } catch (const ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: getDirectForUpdate + * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJZZ)I + */ +jint Java_org_rocksdb_Transaction_getDirectForUpdate( + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle, + jobject jkey_bb, jint jkey_off, jint jkey_part_len, jobject jval_bb, + jint jval_off, jint jval_len, jlong jcolumn_family_handle, + jboolean jexclusive, jboolean jdo_validate) { + auto* txn = reinterpret_cast(jhandle); + auto* read_options = + reinterpret_cast(jread_options_handle); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, + jkey_part_len); + ROCKSDB_NAMESPACE::JDirectBufferPinnableSlice value(env, jval_bb, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + txn->GetForUpdate(*read_options, column_family_handle, key.slice(), + &value.pinnable_slice(), jexclusive, jdo_validate)); + return value.Fetch(); + } catch (const ROCKSDB_NAMESPACE::KVException& e) { + return e.Code(); + } } /* @@ -477,27 +467,24 @@ jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIZZ( * Signature: (JJ[[B[J)[[B */ jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B_3J( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + JNIEnv* env, jclass, jlong jhandle, jlong jread_options_handle, jobjectArray jkey_parts, jlongArray jcolumn_family_handles) { - bool has_exception = false; - const std::vector - column_family_handles = txn_column_families_helper( - env, jcolumn_family_handles, &has_exception); - if (has_exception) { - // exception thrown: OutOfMemoryError + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkey_parts)) { return nullptr; } + auto cf_handles = + ROCKSDB_NAMESPACE::ColumnFamilyJNIHelpers::handlesFromJLongArray( + env, jcolumn_family_handles); + if (!cf_handles) return nullptr; auto* txn = reinterpret_cast(jhandle); - FnMultiGet fn_multi_get_for_update = std::bind (ROCKSDB_NAMESPACE::Transaction::*)( - const ROCKSDB_NAMESPACE::ReadOptions&, - const std::vector&, - const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, - std::placeholders::_1, column_family_handles, std::placeholders::_2, - std::placeholders::_3); - return txn_multi_get_helper(env, fn_multi_get_for_update, - jread_options_handle, jkey_parts); + std::vector values(keys.size()); + std::vector statuses = txn->MultiGetForUpdate( + *reinterpret_cast(jread_options_handle), + *cf_handles, keys.slices(), &values); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* @@ -506,32 +493,21 @@ jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B_3J( * Signature: (JJ[[B)[[B */ jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle, jobjectArray jkey_parts) { - auto* txn = reinterpret_cast(jhandle); - FnMultiGet fn_multi_get_for_update = std::bind (ROCKSDB_NAMESPACE::Transaction::*)( - const ROCKSDB_NAMESPACE::ReadOptions&, - const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, - std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); - return txn_multi_get_helper(env, fn_multi_get_for_update, - jread_options_handle, jkey_parts); -} + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkey_parts)) { + return nullptr; + } -/* - * Class: org_rocksdb_Transaction - * Method: getIterator - * Signature: (JJ)J - */ -jlong Java_org_rocksdb_Transaction_getIterator__JJ(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle, - jlong jread_options_handle) { auto* txn = reinterpret_cast(jhandle); - auto* read_options = - reinterpret_cast(jread_options_handle); - return GET_CPLUSPLUS_POINTER(txn->GetIterator(*read_options)); + std::vector values(keys.size()); + std::vector statuses = txn->MultiGetForUpdate( + *reinterpret_cast(jread_options_handle), + keys.slices(), &values); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays(env, values, + statuses); } /* @@ -539,9 +515,10 @@ jlong Java_org_rocksdb_Transaction_getIterator__JJ(JNIEnv* /*env*/, * Method: getIterator * Signature: (JJJ)J */ -jlong Java_org_rocksdb_Transaction_getIterator__JJJ( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, - jlong jread_options_handle, jlong jcolumn_family_handle) { +jlong Java_org_rocksdb_Transaction_getIterator(JNIEnv* /*env*/, jclass, + jlong jhandle, + jlong jread_options_handle, + jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); auto* read_options = reinterpret_cast(jread_options_handle); @@ -552,84 +529,92 @@ jlong Java_org_rocksdb_Transaction_getIterator__JJJ( txn->GetIterator(*read_options, column_family_handle)); } -typedef std::function - FnWriteKV; - -// TODO(AR) consider refactoring to share this between here and rocksjni.cc -void txn_write_kv_helper(JNIEnv* env, const FnWriteKV& fn_write_kv, - const jbyteArray& jkey, const jint& jkey_part_len, - const jbyteArray& jval, const jint& jval_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if (key == nullptr) { - // exception thrown: OutOfMemoryError - return; - } - jbyte* value = env->GetByteArrayElements(jval, nullptr); - if (value == nullptr) { - // exception thrown: OutOfMemoryError - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +/* + * Class: org_rocksdb_Transaction + * Method: put + * Signature: (J[BII[BIIJZ)V + */ +void Java_org_rocksdb_Transaction_put__J_3BII_3BIIJZ( + JNIEnv* env, jclass, jlong jhandle, jbyteArray jkey, jint jkey_off, + jint jkey_part_len, jbyteArray jval, jint jval_off, jint jval_len, + jlong jcolumn_family_handle, jboolean jassume_tracked) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Put(column_family_handle, key.slice(), value.slice(), + jassume_tracked)); + } catch (ROCKSDB_NAMESPACE::KVException&) { return; } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), - jkey_part_len); - ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), - jval_len); - - ROCKSDB_NAMESPACE::Status s = fn_write_kv(key_slice, value_slice); - - // trigger java unref on key. - // by passing JNI_ABORT, it will simply release the reference without - // copying the result back to the java byte array. - env->ReleaseByteArrayElements(jval, value, JNI_ABORT); - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +} - if (s.ok()) { +/* + * Class: org_rocksdb_Transaction + * Method: put + * Signature: (J[BII[BII)V + */ +void Java_org_rocksdb_Transaction_put__J_3BII_3BII( + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_off, + jint jkey_part_len, jbyteArray jval, jint jval_off, jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Put(key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { return; } - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } /* * Class: org_rocksdb_Transaction - * Method: put - * Signature: (J[BI[BIJZ)V + * Method: putDirect + * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJZ)V */ -void Java_org_rocksdb_Transaction_put__J_3BI_3BIJZ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, - jint jkey_part_len, jbyteArray jval, jint jval_len, +void Java_org_rocksdb_Transaction_putDirect__JLjava_nio_ByteBuffer_2IILjava_nio_ByteBuffer_2IIJZ( + JNIEnv* env, jclass, jlong jhandle, jobject jkey_bb, jint jkey_off, + jint jkey_len, jobject jval_bb, jint jval_off, jint jval_len, jlong jcolumn_family_handle, jboolean jassume_tracked) { auto* txn = reinterpret_cast(jhandle); auto* column_family_handle = reinterpret_cast( jcolumn_family_handle); - FnWriteKV fn_put = - std::bind(&ROCKSDB_NAMESPACE::Transaction::Put, txn, - column_family_handle, std::placeholders::_1, - std::placeholders::_2, jassume_tracked); - txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JDirectBufferSlice value(env, jval_bb, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Put(column_family_handle, key.slice(), value.slice(), + jassume_tracked)); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* * Class: org_rocksdb_Transaction - * Method: put - * Signature: (J[BI[BI)V + * Method: putDirect + * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_Transaction_put__J_3BI_3BI(JNIEnv* env, jobject /*jobj*/, - jlong jhandle, jbyteArray jkey, - jint jkey_part_len, - jbyteArray jval, - jint jval_len) { +void Java_org_rocksdb_Transaction_putDirect__JLjava_nio_ByteBuffer_2IILjava_nio_ByteBuffer_2II( + JNIEnv* env, jclass, jlong jhandle, jobject jkey_bb, jint jkey_off, + jint jkey_len, jobject jval_bb, jint jval_off, jint jval_len) { auto* txn = reinterpret_cast(jhandle); - FnWriteKV fn_put = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1, - std::placeholders::_2); - txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JDirectBufferSlice value(env, jval_bb, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Put(key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } typedef std::function(jhandle); @@ -765,7 +750,7 @@ void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BIJZ( * Signature: (J[[BI[[BI)V */ void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len) { auto* txn = reinterpret_cast(jhandle); FnWriteKVParts fn_put_parts = std::bind(jhandle); auto* column_family_handle = reinterpret_cast( jcolumn_family_handle); - FnWriteKV fn_merge = - std::bind(&ROCKSDB_NAMESPACE::Transaction::Merge, txn, - column_family_handle, std::placeholders::_1, - std::placeholders::_2, jassume_tracked); - txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Merge(column_family_handle, key.slice(), value.slice(), + jassume_tracked)); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* * Class: org_rocksdb_Transaction * Method: merge - * Signature: (J[BI[BI)V + * Signature: (J[BII[BII)V + */ +void Java_org_rocksdb_Transaction_merge__J_3BII_3BII( + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_off, + jint jkey_part_len, jbyteArray jval, jint jval_off, jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Merge(key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: mergeDirect + * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJZ)V */ -void Java_org_rocksdb_Transaction_merge__J_3BI_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, - jint jkey_part_len, jbyteArray jval, jint jval_len) { +JNIEXPORT void JNICALL +Java_org_rocksdb_Transaction_mergeDirect__JLjava_nio_ByteBuffer_2IILjava_nio_ByteBuffer_2IIJZ( + JNIEnv* env, jclass, jlong jhandle, jobject jkey_bb, jint jkey_off, + jint jkey_len, jobject jval_bb, jint jval_off, jint jval_len, + jlong jcolumn_family_handle, jboolean jassume_tracked) { auto* txn = reinterpret_cast(jhandle); - FnWriteKV fn_merge = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::Merge, txn, std::placeholders::_1, - std::placeholders::_2); - txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JDirectBufferSlice value(env, jval_bb, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Merge(column_family_handle, key.slice(), value.slice(), + jassume_tracked)); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: mergeDirect + * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V + */ +JNIEXPORT void JNICALL +Java_org_rocksdb_Transaction_mergeDirect__JLjava_nio_ByteBuffer_2IILjava_nio_ByteBuffer_2II( + JNIEnv* env, jclass, jlong jhandle, jobject jkey_bb, jint jkey_off, + jint jkey_len, jobject jval_bb, jint jval_off, jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey_bb, jkey_off, jkey_len); + ROCKSDB_NAMESPACE::JDirectBufferSlice value(env, jval_bb, jval_off, + jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->Merge(key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } typedef std::function(jhandle); auto* column_family_handle = @@ -871,7 +906,7 @@ void Java_org_rocksdb_Transaction_delete__J_3BIJZ( * Method: delete * Signature: (J[BI)V */ -void Java_org_rocksdb_Transaction_delete__J_3BI(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_delete__J_3BI(JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { auto* txn = reinterpret_cast(jhandle); @@ -939,7 +974,7 @@ void txn_write_k_parts_helper(JNIEnv* env, * Signature: (J[[BIJZ)V */ void Java_org_rocksdb_Transaction_delete__J_3_3BIJZ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, jlong jcolumn_family_handle, jboolean jassume_tracked) { auto* txn = reinterpret_cast(jhandle); @@ -960,7 +995,7 @@ void Java_org_rocksdb_Transaction_delete__J_3_3BIJZ( * Method: delete * Signature: (J[[BI)V */ -void Java_org_rocksdb_Transaction_delete__J_3_3BI(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_delete__J_3_3BI(JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len) { @@ -977,7 +1012,7 @@ void Java_org_rocksdb_Transaction_delete__J_3_3BI(JNIEnv* env, jobject /*jobj*/, * Signature: (J[BIJZ)V */ void Java_org_rocksdb_Transaction_singleDelete__J_3BIJZ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle, jboolean jassume_tracked) { auto* txn = reinterpret_cast(jhandle); auto* column_family_handle = @@ -998,7 +1033,7 @@ void Java_org_rocksdb_Transaction_singleDelete__J_3BIJZ( * Signature: (J[BI)V */ void Java_org_rocksdb_Transaction_singleDelete__J_3BI(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { @@ -1016,7 +1051,7 @@ void Java_org_rocksdb_Transaction_singleDelete__J_3BI(JNIEnv* env, * Signature: (J[[BIJZ)V */ void Java_org_rocksdb_Transaction_singleDelete__J_3_3BIJZ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, jlong jcolumn_family_handle, jboolean jassume_tracked) { auto* txn = reinterpret_cast(jhandle); @@ -1039,7 +1074,7 @@ void Java_org_rocksdb_Transaction_singleDelete__J_3_3BIJZ( * Signature: (J[[BI)V */ void Java_org_rocksdb_Transaction_singleDelete__J_3_3BI(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len) { @@ -1058,21 +1093,22 @@ void Java_org_rocksdb_Transaction_singleDelete__J_3_3BI(JNIEnv* env, * Signature: (J[BI[BIJ)V */ void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len, jbyteArray jval, jint jval_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); auto* column_family_handle = reinterpret_cast( jcolumn_family_handle); - FnWriteKV fn_put_untracked = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, - column_family_handle, std::placeholders::_1, std::placeholders::_2); - txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, - jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, 0, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, 0, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + txn->PutUntracked(column_family_handle, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -1081,16 +1117,17 @@ void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BIJ( * Signature: (J[BI[BI)V */ void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len, jbyteArray jval, jint jval_len) { auto* txn = reinterpret_cast(jhandle); - FnWriteKV fn_put_untracked = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, - std::placeholders::_1, std::placeholders::_2); - txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, - jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, 0, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, 0, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, txn->PutUntracked(key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -1099,7 +1136,7 @@ void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BI( * Signature: (J[[BI[[BIJ)V */ void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); @@ -1122,7 +1159,7 @@ void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BIJ( * Signature: (J[[BI[[BI)V */ void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len) { auto* txn = reinterpret_cast(jhandle); FnWriteKVParts fn_put_parts_untracked = std::bind(jhandle); auto* column_family_handle = reinterpret_cast( jcolumn_family_handle); - FnWriteKV fn_merge_untracked = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, - column_family_handle, std::placeholders::_1, std::placeholders::_2); - txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, - jval_len); + try { + ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_part_len); + ROCKSDB_NAMESPACE::JByteArraySlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + txn->MergeUntracked(column_family_handle, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* * Class: org_rocksdb_Transaction - * Method: mergeUntracked - * Signature: (J[BI[BI)V + * Method: mergeUntrackedDirect + * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, - jint jkey_part_len, jbyteArray jval, jint jval_len) { +void Java_org_rocksdb_Transaction_mergeUntrackedDirect( + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobject jkey, jint jkey_off, + jint jkey_part_len, jobject jval, jint jval_off, jint jval_len, + jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); - FnWriteKV fn_merge_untracked = - std::bind( - &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, - std::placeholders::_1, std::placeholders::_2); - txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, - jval_len); + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + try { + ROCKSDB_NAMESPACE::JDirectBufferSlice key(env, jkey, jkey_off, + jkey_part_len); + ROCKSDB_NAMESPACE::JDirectBufferSlice value(env, jval, jval_off, jval_len); + ROCKSDB_NAMESPACE::KVException::ThrowOnError( + env, + txn->MergeUntracked(column_family_handle, key.slice(), value.slice())); + } catch (ROCKSDB_NAMESPACE::KVException&) { + return; + } } /* @@ -1181,7 +1226,7 @@ void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BI( * Signature: (J[BIJ)V */ void Java_org_rocksdb_Transaction_deleteUntracked__J_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); auto* column_family_handle = @@ -1201,7 +1246,7 @@ void Java_org_rocksdb_Transaction_deleteUntracked__J_3BIJ( * Signature: (J[BI)V */ void Java_org_rocksdb_Transaction_deleteUntracked__J_3BI(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { @@ -1219,7 +1264,7 @@ void Java_org_rocksdb_Transaction_deleteUntracked__J_3BI(JNIEnv* env, * Signature: (J[[BIJ)V */ void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); auto* column_family_handle = @@ -1241,7 +1286,7 @@ void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BIJ( * Signature: (J[[BI)V */ void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len) { auto* txn = reinterpret_cast(jhandle); FnWriteKParts fn_delete_untracked_parts = @@ -1258,7 +1303,7 @@ void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BI( * Method: putLogData * Signature: (J[BI)V */ -void Java_org_rocksdb_Transaction_putLogData(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_putLogData(JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { auto* txn = reinterpret_cast(jhandle); @@ -1285,7 +1330,7 @@ void Java_org_rocksdb_Transaction_putLogData(JNIEnv* env, jobject /*jobj*/, * Signature: (J)V */ void Java_org_rocksdb_Transaction_disableIndexing(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); txn->DisableIndexing(); @@ -1297,7 +1342,7 @@ void Java_org_rocksdb_Transaction_disableIndexing(JNIEnv* /*env*/, * Signature: (J)V */ void Java_org_rocksdb_Transaction_enableIndexing(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); txn->EnableIndexing(); @@ -1308,7 +1353,7 @@ void Java_org_rocksdb_Transaction_enableIndexing(JNIEnv* /*env*/, * Method: getNumKeys * Signature: (J)J */ -jlong Java_org_rocksdb_Transaction_getNumKeys(JNIEnv* /*env*/, jobject /*jobj*/, +jlong Java_org_rocksdb_Transaction_getNumKeys(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return txn->GetNumKeys(); @@ -1319,7 +1364,7 @@ jlong Java_org_rocksdb_Transaction_getNumKeys(JNIEnv* /*env*/, jobject /*jobj*/, * Method: getNumPuts * Signature: (J)J */ -jlong Java_org_rocksdb_Transaction_getNumPuts(JNIEnv* /*env*/, jobject /*jobj*/, +jlong Java_org_rocksdb_Transaction_getNumPuts(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return txn->GetNumPuts(); @@ -1331,7 +1376,7 @@ jlong Java_org_rocksdb_Transaction_getNumPuts(JNIEnv* /*env*/, jobject /*jobj*/, * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getNumDeletes(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return txn->GetNumDeletes(); @@ -1343,7 +1388,7 @@ jlong Java_org_rocksdb_Transaction_getNumDeletes(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getNumMerges(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return txn->GetNumMerges(); @@ -1355,7 +1400,7 @@ jlong Java_org_rocksdb_Transaction_getNumMerges(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getElapsedTime(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return txn->GetElapsedTime(); @@ -1367,7 +1412,7 @@ jlong Java_org_rocksdb_Transaction_getElapsedTime(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getWriteBatch(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return GET_CPLUSPLUS_POINTER(txn->GetWriteBatch()); @@ -1379,8 +1424,7 @@ jlong Java_org_rocksdb_Transaction_getWriteBatch(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_Transaction_setLockTimeout(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle, + jclass /*jcls*/, jlong jhandle, jlong jlock_timeout) { auto* txn = reinterpret_cast(jhandle); txn->SetLockTimeout(jlock_timeout); @@ -1392,7 +1436,7 @@ void Java_org_rocksdb_Transaction_setLockTimeout(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getWriteOptions(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return GET_CPLUSPLUS_POINTER(txn->GetWriteOptions()); @@ -1404,7 +1448,7 @@ jlong Java_org_rocksdb_Transaction_getWriteOptions(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_Transaction_setWriteOptions(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jlong jwrite_options_handle) { auto* txn = reinterpret_cast(jhandle); @@ -1419,7 +1463,7 @@ void Java_org_rocksdb_Transaction_setWriteOptions(JNIEnv* /*env*/, * Signature: (J[BIJ)V */ void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey, + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle) { auto* txn = reinterpret_cast(jhandle); auto* column_family_handle = @@ -1444,7 +1488,7 @@ void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BIJ( * Signature: (J[BI)V */ void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BI(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { @@ -1468,7 +1512,7 @@ void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BI(JNIEnv* env, * Signature: (JJ)V */ void Java_org_rocksdb_Transaction_rebuildFromWriteBatch( - JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jwrite_batch_handle) { + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jwrite_batch_handle) { auto* txn = reinterpret_cast(jhandle); auto* write_batch = reinterpret_cast(jwrite_batch_handle); @@ -1484,7 +1528,7 @@ void Java_org_rocksdb_Transaction_rebuildFromWriteBatch( * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getCommitTimeWriteBatch(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return GET_CPLUSPLUS_POINTER(txn->GetCommitTimeWriteBatch()); @@ -1495,8 +1539,8 @@ jlong Java_org_rocksdb_Transaction_getCommitTimeWriteBatch(JNIEnv* /*env*/, * Method: setLogNumber * Signature: (JJ)V */ -void Java_org_rocksdb_Transaction_setLogNumber(JNIEnv* /*env*/, - jobject /*jobj*/, jlong jhandle, +void Java_org_rocksdb_Transaction_setLogNumber(JNIEnv* /*env*/, jclass /*jcls*/, + jlong jhandle, jlong jlog_number) { auto* txn = reinterpret_cast(jhandle); txn->SetLogNumber(jlog_number); @@ -1508,7 +1552,7 @@ void Java_org_rocksdb_Transaction_setLogNumber(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_Transaction_getLogNumber(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return txn->GetLogNumber(); @@ -1519,7 +1563,7 @@ jlong Java_org_rocksdb_Transaction_getLogNumber(JNIEnv* /*env*/, * Method: setName * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_Transaction_setName(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_Transaction_setName(JNIEnv* env, jclass /*jobj*/, jlong jhandle, jstring jname) { auto* txn = reinterpret_cast(jhandle); const char* name = env->GetStringUTFChars(jname, nullptr); @@ -1542,7 +1586,7 @@ void Java_org_rocksdb_Transaction_setName(JNIEnv* env, jobject /*jobj*/, * Method: getName * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_Transaction_getName(JNIEnv* env, jobject /*jobj*/, +jstring Java_org_rocksdb_Transaction_getName(JNIEnv* env, jclass /*jobj*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::TransactionName name = txn->GetName(); @@ -1554,7 +1598,7 @@ jstring Java_org_rocksdb_Transaction_getName(JNIEnv* env, jobject /*jobj*/, * Method: getID * Signature: (J)J */ -jlong Java_org_rocksdb_Transaction_getID(JNIEnv* /*env*/, jobject /*jobj*/, +jlong Java_org_rocksdb_Transaction_getID(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::TransactionID id = txn->GetID(); @@ -1567,7 +1611,7 @@ jlong Java_org_rocksdb_Transaction_getID(JNIEnv* /*env*/, jobject /*jobj*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_Transaction_isDeadlockDetect(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); return static_cast(txn->IsDeadlockDetect()); @@ -1597,7 +1641,7 @@ jobject Java_org_rocksdb_Transaction_getWaitingTxns(JNIEnv* env, * Method: getState * Signature: (J)B */ -jbyte Java_org_rocksdb_Transaction_getState(JNIEnv* /*env*/, jobject /*jobj*/, +jbyte Java_org_rocksdb_Transaction_getState(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::Transaction::TransactionState txn_status = txn->GetState(); @@ -1636,7 +1680,7 @@ jbyte Java_org_rocksdb_Transaction_getState(JNIEnv* /*env*/, jobject /*jobj*/, * Method: getId * Signature: (J)J */ -jlong Java_org_rocksdb_Transaction_getId(JNIEnv* /*env*/, jobject /*jobj*/, +jlong Java_org_rocksdb_Transaction_getId(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* txn = reinterpret_cast(jhandle); uint64_t id = txn->GetId(); diff --git a/java/rocksjni/transaction_db.cc b/java/rocksjni/transaction_db.cc index 0adf856065e..595360acdeb 100644 --- a/java/rocksjni/transaction_db.cc +++ b/java/rocksjni/transaction_db.cc @@ -147,8 +147,8 @@ jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TransactionDB_disposeInternal(JNIEnv*, jobject, - jlong jhandle) { +void Java_org_rocksdb_TransactionDB_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); assert(txn_db != nullptr); delete txn_db; @@ -173,7 +173,7 @@ void Java_org_rocksdb_TransactionDB_closeDatabase(JNIEnv* env, jclass, * Signature: (JJ)J */ jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle) { + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle) { auto* txn_db = reinterpret_cast(jhandle); auto* write_options = reinterpret_cast(jwrite_options_handle); @@ -188,7 +188,7 @@ jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJ( * Signature: (JJJ)J */ jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle, + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle, jlong jtxn_options_handle) { auto* txn_db = reinterpret_cast(jhandle); auto* write_options = @@ -206,7 +206,7 @@ jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJJ( * Signature: (JJJ)J */ jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle, + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle, jlong jold_txn_handle) { auto* txn_db = reinterpret_cast(jhandle); auto* write_options = @@ -231,7 +231,7 @@ jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJ( * Signature: (JJJJ)J */ jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJJ( - JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle, + JNIEnv*, jclass, jlong jhandle, jlong jwrite_options_handle, jlong jtxn_options_handle, jlong jold_txn_handle) { auto* txn_db = reinterpret_cast(jhandle); auto* write_options = @@ -256,7 +256,7 @@ jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJJ( * Method: getTransactionByName * Signature: (JLjava/lang/String;)J */ -jlong Java_org_rocksdb_TransactionDB_getTransactionByName(JNIEnv* env, jobject, +jlong Java_org_rocksdb_TransactionDB_getTransactionByName(JNIEnv* env, jclass, jlong jhandle, jstring jname) { auto* txn_db = reinterpret_cast(jhandle); @@ -276,7 +276,7 @@ jlong Java_org_rocksdb_TransactionDB_getTransactionByName(JNIEnv* env, jobject, * Signature: (J)[J */ jlongArray Java_org_rocksdb_TransactionDB_getAllPreparedTransactions( - JNIEnv* env, jobject, jlong jhandle) { + JNIEnv* env, jclass, jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); std::vector txns; txn_db->GetAllPreparedTransactions(&txns); @@ -310,7 +310,7 @@ jlongArray Java_org_rocksdb_TransactionDB_getAllPreparedTransactions( * Method: getLockStatusData * Signature: (J)Ljava/util/Map; */ -jobject Java_org_rocksdb_TransactionDB_getLockStatusData(JNIEnv* env, jobject, +jobject Java_org_rocksdb_TransactionDB_getLockStatusData(JNIEnv* env, jclass, jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); const std::unordered_multimap @@ -360,7 +360,7 @@ jobject Java_org_rocksdb_TransactionDB_getLockStatusData(JNIEnv* env, jobject, * Signature: (J)[Lorg/rocksdb/TransactionDB/DeadlockPath; */ jobjectArray Java_org_rocksdb_TransactionDB_getDeadlockInfoBuffer( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jclass jobj, jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); const std::vector deadlock_info_buffer = txn_db->GetDeadlockInfoBuffer(); @@ -445,7 +445,7 @@ jobjectArray Java_org_rocksdb_TransactionDB_getDeadlockInfoBuffer( * Signature: (JI)V */ void Java_org_rocksdb_TransactionDB_setDeadlockInfoBufferSize( - JNIEnv*, jobject, jlong jhandle, jint jdeadlock_info_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jint jdeadlock_info_buffer_size) { auto* txn_db = reinterpret_cast(jhandle); txn_db->SetDeadlockInfoBufferSize(jdeadlock_info_buffer_size); } diff --git a/java/rocksjni/transaction_db_options.cc b/java/rocksjni/transaction_db_options.cc index 4cf27121e9a..813c5372891 100644 --- a/java/rocksjni/transaction_db_options.cc +++ b/java/rocksjni/transaction_db_options.cc @@ -31,7 +31,7 @@ jlong Java_org_rocksdb_TransactionDBOptions_newTransactionDBOptions( * Signature: (J)J */ jlong Java_org_rocksdb_TransactionDBOptions_getMaxNumLocks(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -44,7 +44,7 @@ jlong Java_org_rocksdb_TransactionDBOptions_getMaxNumLocks(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_TransactionDBOptions_setMaxNumLocks( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jmax_num_locks) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jmax_num_locks) { auto* opts = reinterpret_cast(jhandle); opts->max_num_locks = jmax_num_locks; @@ -56,7 +56,7 @@ void Java_org_rocksdb_TransactionDBOptions_setMaxNumLocks( * Signature: (J)J */ jlong Java_org_rocksdb_TransactionDBOptions_getNumStripes(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -69,7 +69,7 @@ jlong Java_org_rocksdb_TransactionDBOptions_getNumStripes(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_TransactionDBOptions_setNumStripes(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jlong jnum_stripes) { auto* opts = @@ -83,7 +83,7 @@ void Java_org_rocksdb_TransactionDBOptions_setNumStripes(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_TransactionDBOptions_getTransactionLockTimeout( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return opts->transaction_lock_timeout; @@ -95,7 +95,7 @@ jlong Java_org_rocksdb_TransactionDBOptions_getTransactionLockTimeout( * Signature: (JJ)V */ void Java_org_rocksdb_TransactionDBOptions_setTransactionLockTimeout( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jtransaction_lock_timeout) { auto* opts = reinterpret_cast(jhandle); @@ -108,7 +108,7 @@ void Java_org_rocksdb_TransactionDBOptions_setTransactionLockTimeout( * Signature: (J)J */ jlong Java_org_rocksdb_TransactionDBOptions_getDefaultLockTimeout( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return opts->default_lock_timeout; @@ -120,7 +120,7 @@ jlong Java_org_rocksdb_TransactionDBOptions_getDefaultLockTimeout( * Signature: (JJ)V */ void Java_org_rocksdb_TransactionDBOptions_setDefaultLockTimeout( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jdefault_lock_timeout) { auto* opts = reinterpret_cast(jhandle); @@ -133,7 +133,7 @@ void Java_org_rocksdb_TransactionDBOptions_setDefaultLockTimeout( * Signature: (J)B */ jbyte Java_org_rocksdb_TransactionDBOptions_getWritePolicy(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -147,7 +147,7 @@ jbyte Java_org_rocksdb_TransactionDBOptions_getWritePolicy(JNIEnv* /*env*/, * Signature: (JB)V */ void Java_org_rocksdb_TransactionDBOptions_setWritePolicy(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jbyte jwrite_policy) { auto* opts = @@ -162,8 +162,8 @@ void Java_org_rocksdb_TransactionDBOptions_setWritePolicy(JNIEnv* /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TransactionDBOptions_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_TransactionDBOptions_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc index 97c3bb30122..955a2dfcb27 100644 --- a/java/rocksjni/transaction_log.cc +++ b/java/rocksjni/transaction_log.cc @@ -20,9 +20,9 @@ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TransactionLogIterator_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_TransactionLogIterator_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { delete reinterpret_cast(handle); } @@ -32,7 +32,7 @@ void Java_org_rocksdb_TransactionLogIterator_disposeInternal(JNIEnv* /*env*/, * Signature: (J)Z */ jboolean Java_org_rocksdb_TransactionLogIterator_isValid(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { return reinterpret_cast(handle) ->Valid(); @@ -44,7 +44,7 @@ jboolean Java_org_rocksdb_TransactionLogIterator_isValid(JNIEnv* /*env*/, * Signature: (J)V */ void Java_org_rocksdb_TransactionLogIterator_next(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { reinterpret_cast(handle)->Next(); } @@ -55,7 +55,7 @@ void Java_org_rocksdb_TransactionLogIterator_next(JNIEnv* /*env*/, * Signature: (J)V */ void Java_org_rocksdb_TransactionLogIterator_status(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { ROCKSDB_NAMESPACE::Status s = reinterpret_cast(handle) @@ -71,7 +71,7 @@ void Java_org_rocksdb_TransactionLogIterator_status(JNIEnv* env, * Signature: (J)Lorg/rocksdb/TransactionLogIterator$BatchResult */ jobject Java_org_rocksdb_TransactionLogIterator_getBatch(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong handle) { ROCKSDB_NAMESPACE::BatchResult batch_result = reinterpret_cast(handle) diff --git a/java/rocksjni/transaction_notifier.cc b/java/rocksjni/transaction_notifier.cc index cefeb648a56..d84e0d5b800 100644 --- a/java/rocksjni/transaction_notifier.cc +++ b/java/rocksjni/transaction_notifier.cc @@ -32,8 +32,8 @@ jlong Java_org_rocksdb_AbstractTransactionNotifier_createNewTransactionNotifier( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_AbstractTransactionNotifier_disposeInternal( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +void Java_org_rocksdb_AbstractTransactionNotifier_disposeInternalJni( + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { // TODO(AR) refactor to use JniCallback::JniCallback // when https://github.com/facebook/rocksdb/pull/1241/ is merged std::shared_ptr* handle = diff --git a/java/rocksjni/transaction_options.cc b/java/rocksjni/transaction_options.cc index dcf363e148a..0a2414fc6d1 100644 --- a/java/rocksjni/transaction_options.cc +++ b/java/rocksjni/transaction_options.cc @@ -29,7 +29,7 @@ jlong Java_org_rocksdb_TransactionOptions_newTransactionOptions( * Signature: (J)Z */ jboolean Java_org_rocksdb_TransactionOptions_isSetSnapshot(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -42,7 +42,7 @@ jboolean Java_org_rocksdb_TransactionOptions_isSetSnapshot(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_TransactionOptions_setSetSnapshot( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean jset_snapshot) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jboolean jset_snapshot) { auto* opts = reinterpret_cast(jhandle); opts->set_snapshot = jset_snapshot; @@ -54,7 +54,7 @@ void Java_org_rocksdb_TransactionOptions_setSetSnapshot( * Signature: (J)Z */ jboolean Java_org_rocksdb_TransactionOptions_isDeadlockDetect(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -67,7 +67,7 @@ jboolean Java_org_rocksdb_TransactionOptions_isDeadlockDetect(JNIEnv* /*env*/, * Signature: (JZ)V */ void Java_org_rocksdb_TransactionOptions_setDeadlockDetect( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jboolean jdeadlock_detect) { auto* opts = reinterpret_cast(jhandle); @@ -80,7 +80,7 @@ void Java_org_rocksdb_TransactionOptions_setDeadlockDetect( * Signature: (J)J */ jlong Java_org_rocksdb_TransactionOptions_getLockTimeout(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -93,7 +93,7 @@ jlong Java_org_rocksdb_TransactionOptions_getLockTimeout(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_TransactionOptions_setLockTimeout(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jlong jlock_timeout) { auto* opts = @@ -107,7 +107,7 @@ void Java_org_rocksdb_TransactionOptions_setLockTimeout(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_TransactionOptions_getExpiration(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -120,7 +120,7 @@ jlong Java_org_rocksdb_TransactionOptions_getExpiration(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_TransactionOptions_setExpiration(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle, jlong jexpiration) { auto* opts = @@ -134,7 +134,7 @@ void Java_org_rocksdb_TransactionOptions_setExpiration(JNIEnv* /*env*/, * Signature: (J)J */ jlong Java_org_rocksdb_TransactionOptions_getDeadlockDetectDepth( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return opts->deadlock_detect_depth; @@ -146,7 +146,7 @@ jlong Java_org_rocksdb_TransactionOptions_getDeadlockDetectDepth( * Signature: (JJ)V */ void Java_org_rocksdb_TransactionOptions_setDeadlockDetectDepth( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jdeadlock_detect_depth) { auto* opts = reinterpret_cast(jhandle); @@ -159,7 +159,7 @@ void Java_org_rocksdb_TransactionOptions_setDeadlockDetectDepth( * Signature: (J)J */ jlong Java_org_rocksdb_TransactionOptions_getMaxWriteBatchSize(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); @@ -172,7 +172,7 @@ jlong Java_org_rocksdb_TransactionOptions_getMaxWriteBatchSize(JNIEnv* /*env*/, * Signature: (JJ)V */ void Java_org_rocksdb_TransactionOptions_setMaxWriteBatchSize( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jmax_write_batch_size) { auto* opts = reinterpret_cast(jhandle); @@ -184,8 +184,8 @@ void Java_org_rocksdb_TransactionOptions_setMaxWriteBatchSize( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TransactionOptions_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_TransactionOptions_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 1fe2083d994..8e23c10279b 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -154,7 +154,7 @@ jlongArray Java_org_rocksdb_TtlDB_openCF(JNIEnv* env, jclass, jlong jopt_handle, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TtlDB_disposeInternal(JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_TtlDB_disposeInternalJni(JNIEnv*, jclass, jlong jhandle) { auto* ttl_db = reinterpret_cast(jhandle); assert(ttl_db != nullptr); delete ttl_db; @@ -181,7 +181,7 @@ void Java_org_rocksdb_TtlDB_closeDatabase(JNIEnv* /* env */, jclass, * Method: createColumnFamilyWithTtl * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;[BJI)J; */ -jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(JNIEnv* env, jobject, +jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(JNIEnv* env, jclass, jlong jdb_handle, jbyteArray jcolumn_name, jlong jcolumn_options, diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 6704e4a7ed9..f2f13b6b757 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -22,7 +22,6 @@ #include "rocksjni/cplusplus_to_java_convert.h" #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" -#include "table/scoped_arena_iterator.h" /* * Class: org_rocksdb_WriteBatch @@ -65,8 +64,8 @@ jlong Java_org_rocksdb_WriteBatch_newWriteBatch___3BI(JNIEnv* env, * Method: count0 * Signature: (J)I */ -jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* /*env*/, jobject /*jobj*/, - jlong jwb_handle) { +jint Java_org_rocksdb_WriteBatch_count0Jni(JNIEnv* /*env*/, jclass /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -78,8 +77,8 @@ jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: clear0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* /*env*/, jobject /*jobj*/, - jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_clear0Jni(JNIEnv* /*env*/, jclass /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -91,9 +90,9 @@ void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: setSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_setSavePoint0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_setSavePoint0Jni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -105,9 +104,9 @@ void Java_org_rocksdb_WriteBatch_setSavePoint0(JNIEnv* /*env*/, * Method: rollbackToSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0(JNIEnv* env, - jobject /*jobj*/, - jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0Jni(JNIEnv* env, + jclass /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -124,8 +123,8 @@ void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0(JNIEnv* env, * Method: popSavePoint * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_popSavePoint(JNIEnv* env, jobject /*jobj*/, - jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_popSavePointJni(JNIEnv* env, jclass /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -142,9 +141,10 @@ void Java_org_rocksdb_WriteBatch_popSavePoint(JNIEnv* env, jobject /*jobj*/, * Method: setMaxBytes * Signature: (JJ)V */ -void Java_org_rocksdb_WriteBatch_setMaxBytes(JNIEnv* /*env*/, jobject /*jobj*/, - jlong jwb_handle, - jlong jmax_bytes) { +void Java_org_rocksdb_WriteBatch_setMaxBytesJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jwb_handle, + jlong jmax_bytes) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -156,11 +156,9 @@ void Java_org_rocksdb_WriteBatch_setMaxBytes(JNIEnv* /*env*/, jobject /*jobj*/, * Method: put * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI(JNIEnv* env, jobject jobj, - jlong jwb_handle, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, - jint jentry_value_len) { +void Java_org_rocksdb_WriteBatch_putJni__J_3BI_3BI( + JNIEnv* env, jclass, jlong jwb_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto put = [&wb](ROCKSDB_NAMESPACE::Slice key, @@ -168,8 +166,8 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI(JNIEnv* env, jobject jobj, return wb->Put(key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len); + ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jkey, jkey_len, jentry_value, + jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -180,8 +178,8 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI(JNIEnv* env, jobject jobj, * Method: put * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, +void Java_org_rocksdb_WriteBatch_putJni__J_3BI_3BIJ( + JNIEnv* env, jclass, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -193,8 +191,8 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ( return wb->Put(cf_handle, key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len); + ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jkey, jkey_len, jentry_value, + jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -205,11 +203,11 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ( * Method: putDirect * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatch_putDirect(JNIEnv* env, jobject /*jobj*/, - jlong jwb_handle, jobject jkey, - jint jkey_offset, jint jkey_len, - jobject jval, jint jval_offset, - jint jval_len, jlong jcf_handle) { +void Java_org_rocksdb_WriteBatch_putDirectJni(JNIEnv* env, jclass /*jobj*/, + jlong jwb_handle, jobject jkey, + jint jkey_offset, jint jkey_len, + jobject jval, jint jval_offset, + jint jval_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = @@ -231,8 +229,8 @@ void Java_org_rocksdb_WriteBatch_putDirect(JNIEnv* env, jobject /*jobj*/, * Method: merge * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, +void Java_org_rocksdb_WriteBatch_mergeJni__J_3BI_3BI( + JNIEnv* env, jclass, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -241,7 +239,7 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( return wb->Merge(key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, + ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jkey, jkey_len, jentry_value, jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -253,8 +251,8 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( * Method: merge * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, +void Java_org_rocksdb_WriteBatch_mergeJni__J_3BI_3BIJ( + JNIEnv* env, jclass, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -266,7 +264,7 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ( return wb->Merge(cf_handle, key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, + ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jkey, jkey_len, jentry_value, jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -278,14 +276,15 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ( * Method: delete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_delete__J_3BI(JNIEnv* env, jobject jobj, - jlong jwb_handle, - jbyteArray jkey, jint jkey_len) { +void Java_org_rocksdb_WriteBatch_deleteJni__J_3BI(JNIEnv* env, jclass, + jlong jwb_handle, + jbyteArray jkey, + jint jkey_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto remove = [&wb](ROCKSDB_NAMESPACE::Slice key) { return wb->Delete(key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -296,10 +295,11 @@ void Java_org_rocksdb_WriteBatch_delete__J_3BI(JNIEnv* env, jobject jobj, * Method: delete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatch_delete__J_3BIJ(JNIEnv* env, jobject jobj, - jlong jwb_handle, - jbyteArray jkey, jint jkey_len, - jlong jcf_handle) { +void Java_org_rocksdb_WriteBatch_deleteJni__J_3BIJ(JNIEnv* env, jclass, + jlong jwb_handle, + jbyteArray jkey, + jint jkey_len, + jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = @@ -309,7 +309,7 @@ void Java_org_rocksdb_WriteBatch_delete__J_3BIJ(JNIEnv* env, jobject jobj, return wb->Delete(cf_handle, key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -320,18 +320,17 @@ void Java_org_rocksdb_WriteBatch_delete__J_3BIJ(JNIEnv* env, jobject jobj, * Method: singleDelete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_singleDelete__J_3BI(JNIEnv* env, jobject jobj, - jlong jwb_handle, - jbyteArray jkey, - jint jkey_len) { +void Java_org_rocksdb_WriteBatch_singleDeleteJni__J_3BI(JNIEnv* env, jclass, + jlong jwb_handle, + jbyteArray jkey, + jint jkey_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto single_delete = [&wb](ROCKSDB_NAMESPACE::Slice key) { return wb->SingleDelete(key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey, - jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -342,11 +341,11 @@ void Java_org_rocksdb_WriteBatch_singleDelete__J_3BI(JNIEnv* env, jobject jobj, * Method: singleDelete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatch_singleDelete__J_3BIJ(JNIEnv* env, jobject jobj, - jlong jwb_handle, - jbyteArray jkey, - jint jkey_len, - jlong jcf_handle) { +void Java_org_rocksdb_WriteBatch_singleDeleteJni__J_3BIJ(JNIEnv* env, jclass, + jlong jwb_handle, + jbyteArray jkey, + jint jkey_len, + jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = @@ -356,8 +355,7 @@ void Java_org_rocksdb_WriteBatch_singleDelete__J_3BIJ(JNIEnv* env, jobject jobj, return wb->SingleDelete(cf_handle, key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey, - jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -368,10 +366,11 @@ void Java_org_rocksdb_WriteBatch_singleDelete__J_3BIJ(JNIEnv* env, jobject jobj, * Method: deleteDirect * Signature: (JLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatch_deleteDirect(JNIEnv* env, jobject /*jobj*/, - jlong jwb_handle, jobject jkey, - jint jkey_offset, jint jkey_len, - jlong jcf_handle) { +void Java_org_rocksdb_WriteBatch_deleteDirectJni(JNIEnv* env, jclass /*jobj*/, + jlong jwb_handle, jobject jkey, + jint jkey_offset, + jint jkey_len, + jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = @@ -392,8 +391,8 @@ void Java_org_rocksdb_WriteBatch_deleteDirect(JNIEnv* env, jobject /*jobj*/, * Method: deleteRange * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jbegin_key, +void Java_org_rocksdb_WriteBatch_deleteRangeJni__J_3BI_3BI( + JNIEnv* env, jclass, jlong jwb_handle, jbyteArray jbegin_key, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -402,7 +401,7 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( return wb->DeleteRange(beginKey, endKey); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jbegin_key, jbegin_key_len, jend_key, jend_key_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -414,8 +413,8 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( * Method: deleteRange * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jbegin_key, +void Java_org_rocksdb_WriteBatch_deleteRangeJni__J_3BI_3BIJ( + JNIEnv* env, jclass, jlong jwb_handle, jbyteArray jbegin_key, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); @@ -428,7 +427,7 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ( return wb->DeleteRange(cf_handle, beginKey, endKey); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jbegin_key, jbegin_key_len, jend_key, jend_key_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -440,16 +439,17 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ( * Method: putLogData * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_putLogData(JNIEnv* env, jobject jobj, - jlong jwb_handle, jbyteArray jblob, - jint jblob_len) { +void Java_org_rocksdb_WriteBatch_putLogDataJni(JNIEnv* env, jclass, + jlong jwb_handle, + jbyteArray jblob, + jint jblob_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto putLogData = [&wb](ROCKSDB_NAMESPACE::Slice blob) { return wb->PutLogData(blob); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(putLogData, env, jblob, jblob_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -460,7 +460,7 @@ void Java_org_rocksdb_WriteBatch_putLogData(JNIEnv* env, jobject jobj, * Method: iterate * Signature: (JJ)V */ -void Java_org_rocksdb_WriteBatch_iterate(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_WriteBatch_iterate(JNIEnv* env, jclass /*jcls*/, jlong jwb_handle, jlong handlerHandle) { auto* wb = reinterpret_cast(jwb_handle); @@ -481,7 +481,7 @@ void Java_org_rocksdb_WriteBatch_iterate(JNIEnv* env, jobject /*jobj*/, * Method: data * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_WriteBatch_data(JNIEnv* env, jobject /*jobj*/, +jbyteArray Java_org_rocksdb_WriteBatch_data(JNIEnv* env, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -495,7 +495,7 @@ jbyteArray Java_org_rocksdb_WriteBatch_data(JNIEnv* env, jobject /*jobj*/, * Method: getDataSize * Signature: (J)J */ -jlong Java_org_rocksdb_WriteBatch_getDataSize(JNIEnv* /*env*/, jobject /*jobj*/, +jlong Java_org_rocksdb_WriteBatch_getDataSize(JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -509,7 +509,7 @@ jlong Java_org_rocksdb_WriteBatch_getDataSize(JNIEnv* /*env*/, jobject /*jobj*/, * Method: hasPut * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteBatch_hasPut(JNIEnv* /*env*/, jobject /*jobj*/, +jboolean Java_org_rocksdb_WriteBatch_hasPut(JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -522,8 +522,7 @@ jboolean Java_org_rocksdb_WriteBatch_hasPut(JNIEnv* /*env*/, jobject /*jobj*/, * Method: hasDelete * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteBatch_hasDelete(JNIEnv* /*env*/, - jobject /*jobj*/, +jboolean Java_org_rocksdb_WriteBatch_hasDelete(JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -537,7 +536,7 @@ jboolean Java_org_rocksdb_WriteBatch_hasDelete(JNIEnv* /*env*/, * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasSingleDelete( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -550,7 +549,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasSingleDelete( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasDeleteRange( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -563,7 +562,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasDeleteRange( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasMerge( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -576,7 +575,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasMerge( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasBeginPrepare( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -589,7 +588,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasBeginPrepare( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasEndPrepare( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -602,7 +601,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasEndPrepare( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasCommit( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -615,7 +614,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasCommit( * Signature: (J)Z */ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasRollback( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + JNIEnv* /*env*/, jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -628,7 +627,7 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasRollback( * Signature: (J)V */ void Java_org_rocksdb_WriteBatch_markWalTerminationPoint(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -642,7 +641,7 @@ void Java_org_rocksdb_WriteBatch_markWalTerminationPoint(JNIEnv* /*env*/, * Signature: (J)Lorg/rocksdb/WriteBatch/SavePoint; */ jobject Java_org_rocksdb_WriteBatch_getWalTerminationPoint(JNIEnv* env, - jobject /*jobj*/, + jclass /*jcls*/, jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -656,9 +655,9 @@ jobject Java_org_rocksdb_WriteBatch_getWalTerminationPoint(JNIEnv* env, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WriteBatch_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong handle) { auto* wb = reinterpret_cast(handle); assert(wb != nullptr); delete wb; diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 30b9a722979..53f10998ca8 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -22,7 +22,6 @@ #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" -#include "table/scoped_arena_iterator.h" #include "test_util/testharness.h" #include "util/string_util.h" @@ -59,8 +58,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, nullptr, nullptr); unsigned int count = 0; ROCKSDB_NAMESPACE::Arena arena; - ROCKSDB_NAMESPACE::ScopedArenaIterator iter( - mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), &arena)); + ROCKSDB_NAMESPACE::ScopedArenaPtr iter( + mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), + /*seqno_to_time_mapping=*/nullptr, &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc index a5c3216cb32..7d6e8b1a293 100644 --- a/java/rocksjni/write_batch_with_index.cc +++ b/java/rocksjni/write_batch_with_index.cc @@ -72,9 +72,9 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JBIZ( * Method: count0 * Signature: (J)I */ -jint Java_org_rocksdb_WriteBatchWithIndex_count0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwbwi_handle) { +jint Java_org_rocksdb_WriteBatchWithIndex_count0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -87,9 +87,9 @@ jint Java_org_rocksdb_WriteBatchWithIndex_count0(JNIEnv* /*env*/, * Method: put * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { +void Java_org_rocksdb_WriteBatchWithIndex_putJni__J_3BI_3BI( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -98,8 +98,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI( return wbwi->Put(key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len); + ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jkey, jkey_len, jentry_value, + jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -110,10 +110,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI( * Method: put * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, - jlong jcf_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_putJni__J_3BI_3BIJ( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -125,8 +124,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ( return wbwi->Put(cf_handle, key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len); + ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jkey, jkey_len, jentry_value, + jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -137,8 +136,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ( * Method: putDirect * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_putDirect( - JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey, +void Java_org_rocksdb_WriteBatchWithIndex_putDirectJni( + JNIEnv* env, jclass /*jobj*/, jlong jwb_handle, jobject jkey, jint jkey_offset, jint jkey_len, jobject jval, jint jval_offset, jint jval_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); @@ -162,9 +161,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_putDirect( * Method: merge * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { +void Java_org_rocksdb_WriteBatchWithIndex_mergeJni__J_3BI_3BI( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -173,7 +172,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI( return wbwi->Merge(key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, + ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jkey, jkey_len, jentry_value, jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -185,10 +184,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI( * Method: merge * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, - jlong jcf_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_mergeJni__J_3BI_3BIJ( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -200,7 +198,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ( return wbwi->Merge(cf_handle, key, value); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, + ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jkey, jkey_len, jentry_value, jentry_value_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -212,11 +210,10 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ( * Method: delete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BI(JNIEnv* env, - jobject jobj, - jlong jwbwi_handle, - jbyteArray jkey, - jint jkey_len) { +void Java_org_rocksdb_WriteBatchWithIndex_deleteJni__J_3BI(JNIEnv* env, jclass, + jlong jwbwi_handle, + jbyteArray jkey, + jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -224,7 +221,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BI(JNIEnv* env, return wbwi->Delete(key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -235,9 +232,11 @@ void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BI(JNIEnv* env, * Method: delete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len, jlong jcf_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_deleteJni__J_3BIJ(JNIEnv* env, jclass, + jlong jwbwi_handle, + jbyteArray jkey, + jint jkey_len, + jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -248,7 +247,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BIJ( return wbwi->Delete(cf_handle, key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -259,9 +258,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BIJ( * Method: singleDelete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BI( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len) { +void Java_org_rocksdb_WriteBatchWithIndex_singleDeleteJni__J_3BI( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -269,8 +267,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BI( return wbwi->SingleDelete(key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey, - jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -281,9 +278,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BI( * Method: singleDelete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, - jint jkey_len, jlong jcf_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_singleDeleteJni__J_3BIJ( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, + jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -294,8 +291,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BIJ( return wbwi->SingleDelete(cf_handle, key); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey, - jkey_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jkey, jkey_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -306,8 +302,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BIJ( * Method: deleteDirect * Signature: (JLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_deleteDirect( - JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey, +void Java_org_rocksdb_WriteBatchWithIndex_deleteDirectJni( + JNIEnv* env, jclass /*jobj*/, jlong jwb_handle, jobject jkey, jint jkey_offset, jint jkey_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -329,8 +325,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteDirect( * Method: deleteRange * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jbegin_key, +void Java_org_rocksdb_WriteBatchWithIndex_deleteRangeJni__J_3BI_3BI( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jbegin_key, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -340,7 +336,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BI( return wbwi->DeleteRange(beginKey, endKey); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jbegin_key, jbegin_key_len, jend_key, jend_key_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -352,8 +348,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BI( * Method: deleteRange * Signature: (J[BI[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jbegin_key, +void Java_org_rocksdb_WriteBatchWithIndex_deleteRangeJni__J_3BI_3BIJ( + JNIEnv* env, jclass, jlong jwbwi_handle, jbyteArray jbegin_key, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len, jlong jcf_handle) { auto* wbwi = @@ -367,7 +363,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ( return wbwi->DeleteRange(cf_handle, beginKey, endKey); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jbegin_key, jbegin_key_len, jend_key, jend_key_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); @@ -379,10 +375,10 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ( * Method: putLogData * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_putLogData(JNIEnv* env, jobject jobj, - jlong jwbwi_handle, - jbyteArray jblob, - jint jblob_len) { +void Java_org_rocksdb_WriteBatchWithIndex_putLogDataJni(JNIEnv* env, jclass, + jlong jwbwi_handle, + jbyteArray jblob, + jint jblob_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -390,7 +386,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_putLogData(JNIEnv* env, jobject jobj, return wbwi->PutLogData(blob); }; std::unique_ptr status = - ROCKSDB_NAMESPACE::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + ROCKSDB_NAMESPACE::JniUtil::k_op(putLogData, env, jblob, jblob_len); if (status != nullptr && !status->ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status); } @@ -401,9 +397,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_putLogData(JNIEnv* env, jobject jobj, * Method: clear * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_clear0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwbwi_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_clear0Jni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -416,9 +412,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_clear0(JNIEnv* /*env*/, * Method: setSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwbwi_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -431,8 +427,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0(JNIEnv* /*env*/, * Method: rollbackToSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0( - JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0Jni( + JNIEnv* env, jclass /*jobj*/, jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -451,9 +447,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0( * Method: popSavePoint * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_popSavePoint(JNIEnv* env, - jobject /*jobj*/, - jlong jwbwi_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_popSavePointJni(JNIEnv* env, + jclass /*jobj*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -472,10 +468,10 @@ void Java_org_rocksdb_WriteBatchWithIndex_popSavePoint(JNIEnv* env, * Method: setMaxBytes * Signature: (JJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_setMaxBytes(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwbwi_handle, - jlong jmax_bytes) { +void Java_org_rocksdb_WriteBatchWithIndex_setMaxBytesJni(JNIEnv* /*env*/, + jclass /*cls*/, + jlong jwbwi_handle, + jlong jmax_bytes) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -488,9 +484,8 @@ void Java_org_rocksdb_WriteBatchWithIndex_setMaxBytes(JNIEnv* /*env*/, * Method: getWriteBatch * Signature: (J)Lorg/rocksdb/WriteBatch; */ -jobject Java_org_rocksdb_WriteBatchWithIndex_getWriteBatch(JNIEnv* env, - jobject /*jobj*/, - jlong jwbwi_handle) { +jobject Java_org_rocksdb_WriteBatchWithIndex_getWriteBatchJni( + JNIEnv* env, jclass /*jobj*/, jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -507,7 +502,7 @@ jobject Java_org_rocksdb_WriteBatchWithIndex_getWriteBatch(JNIEnv* env, * Signature: (J)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -521,7 +516,7 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0(JNIEnv* /*env*/, * Signature: (JJ)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1(JNIEnv* /*env*/, - jobject /*jobj*/, + jclass /*jcls*/, jlong jwbwi_handle, jlong jcf_handle) { auto* wbwi = @@ -538,7 +533,7 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1(JNIEnv* /*env*/, * Signature: (JJJJ)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( - JNIEnv*, jobject, jlong jwbwi_handle, jlong jcf_handle, + JNIEnv*, jclass, jlong jwbwi_handle, jlong jcf_handle, jlong jbase_iterator_handle, jlong jread_opts_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -562,7 +557,7 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( * Signature: (JJ[BI)[B */ jbyteArray JNICALL Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdbopt_handle, + JNIEnv* env, jclass /*jcls*/, jlong jwbwi_handle, jlong jdbopt_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -582,7 +577,7 @@ jbyteArray JNICALL Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BI( * Signature: (JJ[BIJ)[B */ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdbopt_handle, + JNIEnv* env, jclass /*jcls*/, jlong jwbwi_handle, jlong jdbopt_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -604,7 +599,7 @@ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BIJ( * Signature: (JJJ[BI)[B */ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BI( - JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdb_handle, + JNIEnv* env, jclass /*jcls*/, jlong jwbwi_handle, jlong jdb_handle, jlong jreadopt_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -626,7 +621,7 @@ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BI( * Signature: (JJJ[BIJ)[B */ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BIJ( - JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdb_handle, + JNIEnv* env, jclass /*jcls*/, jlong jwbwi_handle, jlong jdb_handle, jlong jreadopt_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -649,9 +644,9 @@ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BIJ( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WriteBatchWithIndex_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { auto* wbwi = reinterpret_cast(handle); assert(wbwi != nullptr); @@ -665,9 +660,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal(JNIEnv* /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jobj*/, + jlong handle) { auto* it = reinterpret_cast(handle); assert(it != nullptr); delete it; @@ -678,9 +673,9 @@ void Java_org_rocksdb_WBWIRocksIterator_disposeInternal(JNIEnv* /*env*/, * Method: isValid0 * Signature: (J)Z */ -jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { return reinterpret_cast(handle)->Valid(); } @@ -689,9 +684,9 @@ jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0(JNIEnv* /*env*/, * Method: seekToFirst0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->SeekToFirst(); } @@ -700,9 +695,9 @@ void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0(JNIEnv* /*env*/, * Method: seekToLast0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekToLast0(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_seekToLast0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->SeekToLast(); } @@ -711,8 +706,9 @@ void Java_org_rocksdb_WBWIRocksIterator_seekToLast0(JNIEnv* /*env*/, * Method: next0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_next0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->Next(); } @@ -721,8 +717,9 @@ void Java_org_rocksdb_WBWIRocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: prev0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_prev0Jni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong handle) { reinterpret_cast(handle)->Prev(); } @@ -731,9 +728,10 @@ void Java_org_rocksdb_WBWIRocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/, * Method: seek0 * Signature: (J[BI)V */ -void Java_org_rocksdb_WBWIRocksIterator_seek0(JNIEnv* env, jobject /*jobj*/, - jlong handle, jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_WBWIRocksIterator_seek0Jni(JNIEnv* env, jclass /*jcls*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { auto* it = reinterpret_cast(handle); jbyte* target = new jbyte[jtarget_len]; env->GetByteArrayRegion(jtarget, 0, jtarget_len, target); @@ -756,8 +754,8 @@ void Java_org_rocksdb_WBWIRocksIterator_seek0(JNIEnv* env, jobject /*jobj*/, * Method: seekDirect0 * Signature: (JLjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, +void Java_org_rocksdb_WBWIRocksIterator_seekDirect0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -775,8 +773,8 @@ void Java_org_rocksdb_WBWIRocksIterator_seekDirect0( * Method: seekByteArray0 * Signature: (J[BII)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget, +void Java_org_rocksdb_WBWIRocksIterator_seekByteArray0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jtarget, jint jtarget_off, jint jtarget_len) { const std::unique_ptr target(new char[jtarget_len]); if (target == nullptr) { @@ -799,11 +797,11 @@ void Java_org_rocksdb_WBWIRocksIterator_seekByteArray0( * Method: seekForPrev0 * Signature: (J[BI)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekForPrev0(JNIEnv* env, - jobject /*jobj*/, - jlong handle, - jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_WBWIRocksIterator_seekForPrev0Jni(JNIEnv* env, + jclass /*jcls*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { auto* it = reinterpret_cast(handle); jbyte* target = new jbyte[jtarget_len]; env->GetByteArrayRegion(jtarget, 0, jtarget_len, target); @@ -826,8 +824,8 @@ void Java_org_rocksdb_WBWIRocksIterator_seekForPrev0(JNIEnv* env, * Method: seekForPrevDirect0 * Signature: (JLjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekForPrevDirect0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget, +void Java_org_rocksdb_WBWIRocksIterator_seekForPrevDirect0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jobject jtarget, jint jtarget_off, jint jtarget_len) { auto* it = reinterpret_cast(handle); auto seek_for_prev = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) { @@ -845,8 +843,8 @@ void Java_org_rocksdb_WBWIRocksIterator_seekForPrevDirect0( * Method: seekForPrevByteArray0 * Signature: (J[BII)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekForPrevByteArray0( - JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget, +void Java_org_rocksdb_WBWIRocksIterator_seekForPrevByteArray0Jni( + JNIEnv* env, jclass /*jcls*/, jlong handle, jbyteArray jtarget, jint jtarget_off, jint jtarget_len) { const std::unique_ptr target(new char[jtarget_len]); if (target == nullptr) { @@ -869,8 +867,8 @@ void Java_org_rocksdb_WBWIRocksIterator_seekForPrevByteArray0( * Method: status0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_status0(JNIEnv* env, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_status0Jni(JNIEnv* env, jclass /*jcls*/, + jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Status s = it->status(); @@ -887,7 +885,7 @@ void Java_org_rocksdb_WBWIRocksIterator_status0(JNIEnv* env, jobject /*jobj*/, * Signature: (J)[J */ jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1(JNIEnv* env, - jobject /*jobj*/, + jclass /*jobj*/, jlong handle) { auto* it = reinterpret_cast(handle); const ROCKSDB_NAMESPACE::WriteEntry& we = it->Entry(); @@ -946,8 +944,23 @@ jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1(JNIEnv* env, * Method: refresh0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_refresh0(JNIEnv* env) { +void Java_org_rocksdb_WBWIRocksIterator_refresh0Jni(JNIEnv* env, + jobject /*jobj*/, + jlong /*handle*/) { ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::Status::NotSupported("Refresh() is not supported"); ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: refresh1 + * Signature: (JJ)V + */ +void Java_org_rocksdb_WBWIRocksIterator_refresh1(JNIEnv* env, jobject /*jobj*/, + jlong /*handle*/, + jlong /*snapshot_handle*/) { + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::Status::NotSupported( + "Refresh(Snapshot*) is not supported"); + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} diff --git a/java/rocksjni/write_buffer_manager.cc b/java/rocksjni/write_buffer_manager.cc index 9ce697e10ab..8dd82063325 100644 --- a/java/rocksjni/write_buffer_manager.cc +++ b/java/rocksjni/write_buffer_manager.cc @@ -36,9 +36,9 @@ jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WriteBufferManager_disposeInternal(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jhandle) { +void Java_org_rocksdb_WriteBufferManager_disposeInternalJni(JNIEnv* /*env*/, + jclass /*jcls*/, + jlong jhandle) { auto* write_buffer_manager = reinterpret_cast*>( jhandle); diff --git a/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java b/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java index 728cda8c1d4..c10fb8a2a9b 100644 --- a/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java +++ b/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java @@ -74,5 +74,5 @@ protected void disposeInternal() { } private native long createNewCompactionFilterFactory0(); - private native void disposeInternal(final long handle); + private static native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/AbstractComparator.java b/java/src/main/java/org/rocksdb/AbstractComparator.java index 83e0f067601..5cb33c812d1 100644 --- a/java/src/main/java/org/rocksdb/AbstractComparator.java +++ b/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -118,7 +118,7 @@ public final boolean usingDirectBuffers() { return usingDirectBuffers(nativeHandle_); } - private native boolean usingDirectBuffers(final long nativeHandle); + private static native boolean usingDirectBuffers(final long nativeHandle); private native long createNewComparator(final long comparatorOptionsHandle); } diff --git a/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java index 1aade1b8982..b7af848f0c5 100644 --- a/java/src/main/java/org/rocksdb/AbstractRocksIterator.java +++ b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java @@ -108,6 +108,12 @@ public void refresh() throws RocksDBException { refresh0(nativeHandle_); } + @Override + public void refresh(final Snapshot snapshot) throws RocksDBException { + assert (isOwningHandle()); + refresh1(nativeHandle_, snapshot.getNativeHandle()); + } + @Override public void status() throws RocksDBException { assert (isOwningHandle()); @@ -135,6 +141,7 @@ protected void disposeInternal() { abstract void next0(long handle); abstract void prev0(long handle); abstract void refresh0(long handle) throws RocksDBException; + abstract void refresh1(long handle, long snapshotHandle) throws RocksDBException; abstract void seek0(long handle, byte[] target, int targetLen); abstract void seekForPrev0(long handle, byte[] target, int targetLen); abstract void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen); diff --git a/java/src/main/java/org/rocksdb/AbstractSlice.java b/java/src/main/java/org/rocksdb/AbstractSlice.java index f321b9910ae..a73d9c644f1 100644 --- a/java/src/main/java/org/rocksdb/AbstractSlice.java +++ b/java/src/main/java/org/rocksdb/AbstractSlice.java @@ -175,11 +175,11 @@ public boolean startsWith(final AbstractSlice prefix) { } protected static native long createNewSliceFromString(final String str); - private native int size0(long handle); - private native boolean empty0(long handle); - private native String toString0(long handle, boolean hex); - private native int compare0(long handle, long otherHandle); - private native boolean startsWith0(long handle, long otherHandle); + private static native int size0(long handle); + private static native boolean empty0(long handle); + private static native String toString0(long handle, boolean hex); + private static native int compare0(long handle, long otherHandle); + private static native boolean startsWith0(long handle, long otherHandle); /** * Deletes underlying C++ slice pointer. @@ -188,6 +188,9 @@ public boolean startsWith(final AbstractSlice prefix) { * Otherwise, an undefined behavior will occur. */ @Override - protected final native void disposeInternal(final long handle); + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java b/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java index b117e5cc2ad..40caaa0854c 100644 --- a/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java +++ b/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java @@ -50,5 +50,9 @@ protected long initializeNative(final long... nativeParameterHandles) { protected void disposeInternal() { disposeInternal(nativeHandle_); } - protected final native void disposeInternal(final long handle); + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/AccessHint.java b/java/src/main/java/org/rocksdb/AccessHint.java deleted file mode 100644 index b7ccadd84a6..00000000000 --- a/java/src/main/java/org/rocksdb/AccessHint.java +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -package org.rocksdb; - -/** - * File access pattern once a compaction has started - */ -@Deprecated -public enum AccessHint { - NONE((byte)0x0), - NORMAL((byte)0x1), - SEQUENTIAL((byte)0x2), - WILLNEED((byte)0x3); - - private final byte value; - - AccessHint(final byte value) { - this.value = value; - } - - /** - *

Returns the byte value of the enumerations value.

- * - * @return byte representation - */ - public byte getValue() { - return value; - } - - /** - *

Get the AccessHint enumeration value by - * passing the byte identifier to this method.

- * - * @param byteIdentifier of AccessHint. - * - * @return AccessHint instance. - * - * @throws IllegalArgumentException if the access hint for the byteIdentifier - * cannot be found - */ - public static AccessHint getAccessHint(final byte byteIdentifier) { - for (final AccessHint accessHint : AccessHint.values()) { - if (accessHint.getValue() == byteIdentifier) { - return accessHint; - } - } - - throw new IllegalArgumentException( - "Illegal value provided for AccessHint."); - } -} diff --git a/java/src/main/java/org/rocksdb/BackupEngine.java b/java/src/main/java/org/rocksdb/BackupEngine.java index 3ab2206830f..4ee675ad758 100644 --- a/java/src/main/java/org/rocksdb/BackupEngine.java +++ b/java/src/main/java/org/rocksdb/BackupEngine.java @@ -229,31 +229,35 @@ public void restoreDbFromLatestBackup( private static native long open(final long env, final long backupEngineOptions) throws RocksDBException; - private native void createNewBackup(final long handle, final long dbHandle, + private static native void createNewBackup(final long handle, final long dbHandle, final boolean flushBeforeBackup) throws RocksDBException; - private native void createNewBackupWithMetadata(final long handle, final long dbHandle, + private static native void createNewBackupWithMetadata(final long handle, final long dbHandle, final String metadata, final boolean flushBeforeBackup) throws RocksDBException; - private native List getBackupInfo(final long handle); + private static native List getBackupInfo(final long handle); - private native int[] getCorruptedBackups(final long handle); + private static native int[] getCorruptedBackups(final long handle); - private native void garbageCollect(final long handle) throws RocksDBException; + private static native void garbageCollect(final long handle) throws RocksDBException; - private native void purgeOldBackups(final long handle, - final int numBackupsToKeep) throws RocksDBException; - - private native void deleteBackup(final long handle, final int backupId) + private static native void purgeOldBackups(final long handle, final int numBackupsToKeep) throws RocksDBException; - private native void restoreDbFromBackup(final long handle, final int backupId, - final String dbDir, final String walDir, final long restoreOptionsHandle) + private static native void deleteBackup(final long handle, final int backupId) throws RocksDBException; - private native void restoreDbFromLatestBackup(final long handle, + private static native void restoreDbFromBackup(final long handle, final int backupId, final String dbDir, final String walDir, final long restoreOptionsHandle) throws RocksDBException; - @Override protected final native void disposeInternal(final long handle); + private static native void restoreDbFromLatestBackup(final long handle, final String dbDir, + final String walDir, final long restoreOptionsHandle) throws RocksDBException; + + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/BackupEngineOptions.java b/java/src/main/java/org/rocksdb/BackupEngineOptions.java index 7747b944f91..cefce7e19dc 100644 --- a/java/src/main/java/org/rocksdb/BackupEngineOptions.java +++ b/java/src/main/java/org/rocksdb/BackupEngineOptions.java @@ -426,31 +426,35 @@ public long callbackTriggerIntervalSize() { } private static native long newBackupEngineOptions(final String path); - private native String backupDir(long handle); - private native void setBackupEnv(final long handle, final long envHandle); - private native void setShareTableFiles(long handle, boolean flag); - private native boolean shareTableFiles(long handle); - private native void setInfoLog(final long handle, final long infoLogHandle); - private native void setSync(long handle, boolean flag); - private native boolean sync(long handle); - private native void setDestroyOldData(long handle, boolean flag); - private native boolean destroyOldData(long handle); - private native void setBackupLogFiles(long handle, boolean flag); - private native boolean backupLogFiles(long handle); - private native void setBackupRateLimit(long handle, long rateLimit); - private native long backupRateLimit(long handle); - private native void setBackupRateLimiter(long handle, long rateLimiterHandle); - private native void setRestoreRateLimit(long handle, long rateLimit); - private native long restoreRateLimit(long handle); - private native void setRestoreRateLimiter(final long handle, - final long rateLimiterHandle); - private native void setShareFilesWithChecksum(long handle, boolean flag); - private native boolean shareFilesWithChecksum(long handle); - private native void setMaxBackgroundOperations(final long handle, - final int maxBackgroundOperations); - private native int maxBackgroundOperations(final long handle); - private native void setCallbackTriggerIntervalSize(final long handle, - long callbackTriggerIntervalSize); - private native long callbackTriggerIntervalSize(final long handle); - @Override protected final native void disposeInternal(final long handle); + private static native String backupDir(long handle); + private static native void setBackupEnv(final long handle, final long envHandle); + private static native void setShareTableFiles(long handle, boolean flag); + private static native boolean shareTableFiles(long handle); + private static native void setInfoLog(final long handle, final long infoLogHandle); + private static native void setSync(long handle, boolean flag); + private static native boolean sync(long handle); + private static native void setDestroyOldData(long handle, boolean flag); + private static native boolean destroyOldData(long handle); + private static native void setBackupLogFiles(long handle, boolean flag); + private static native boolean backupLogFiles(long handle); + private static native void setBackupRateLimit(long handle, long rateLimit); + private static native long backupRateLimit(long handle); + private static native void setBackupRateLimiter(long handle, long rateLimiterHandle); + private static native void setRestoreRateLimit(long handle, long rateLimit); + private static native long restoreRateLimit(long handle); + private static native void setRestoreRateLimiter(final long handle, final long rateLimiterHandle); + private static native void setShareFilesWithChecksum(long handle, boolean flag); + private static native boolean shareFilesWithChecksum(long handle); + private static native void setMaxBackgroundOperations( + final long handle, final int maxBackgroundOperations); + private static native int maxBackgroundOperations(final long handle); + private static native void setCallbackTriggerIntervalSize( + final long handle, long callbackTriggerIntervalSize); + private static native long callbackTriggerIntervalSize(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index c82c3ea10ee..d066131458c 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -37,7 +37,7 @@ public BlockBasedTableConfig() { wholeKeyFiltering = true; verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 5; + formatVersion = 6; enableIndexCompression = true; blockAlign = false; indexShortening = IndexShorteningMode.kShortenSeparators; @@ -949,7 +949,7 @@ public BlockBasedTableConfig setHashIndexAllowCollision( indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits); } - private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, + private static native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, final boolean cacheIndexAndFilterBlocksWithHighPriority, final boolean pinL0FilterAndIndexBlocksInCache, final boolean pinTopLevelIndexAndFilter, final byte indexTypeValue, final byte dataBlockIndexTypeValue, diff --git a/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java b/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java index 732faee207a..cdb82ee4347 100644 --- a/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java +++ b/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java @@ -21,5 +21,10 @@ public CassandraValueMergeOperator(final int gcGracePeriodInSeconds, final int o private static native long newSharedCassandraValueMergeOperator( int gcGracePeriodInSeconds, int limit); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/Checkpoint.java b/java/src/main/java/org/rocksdb/Checkpoint.java index 347221df6ed..e50068a6e32 100644 --- a/java/src/main/java/org/rocksdb/Checkpoint.java +++ b/java/src/main/java/org/rocksdb/Checkpoint.java @@ -61,9 +61,13 @@ private Checkpoint(final RocksDB db) { } private static native long newCheckpoint(long dbHandle); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); - private native void createCheckpoint(long handle, String checkpointPath) + private static native void createCheckpoint(long handle, String checkpointPath) throws RocksDBException; private native long exportColumnFamily(long handle, long columnFamilyHandle, String exportPath) diff --git a/java/src/main/java/org/rocksdb/ClockCache.java b/java/src/main/java/org/rocksdb/ClockCache.java index f9f6da74c08..afbd7f75532 100644 --- a/java/src/main/java/org/rocksdb/ClockCache.java +++ b/java/src/main/java/org/rocksdb/ClockCache.java @@ -65,5 +65,9 @@ public ClockCache(final long capacity, final int numShardBits, private static native long newClockCache( final long capacity, final int numShardBits, final boolean strictCapacityLimit); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 9fd63e76805..00bff0b0730 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -142,10 +142,15 @@ protected void disposeInternal() { } } - private native byte[] getName(final long handle) throws RocksDBException; - private native int getID(final long handle); - private native ColumnFamilyDescriptor getDescriptor(final long handle) throws RocksDBException; - @Override protected final native void disposeInternal(final long handle); + private static native byte[] getName(final long handle) throws RocksDBException; + private static native int getID(final long handle); + private static native ColumnFamilyDescriptor getDescriptor(final long handle) + throws RocksDBException; + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private final RocksDB rocksDB_; } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 607a17936e1..bb458078cd6 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -1342,206 +1342,186 @@ private static long newColumnFamilyOptionsInstance() { private static native long copyColumnFamilyOptions(final long handle); private static native long newColumnFamilyOptionsFromOptions( final long optionsHandle); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native void oldDefaults( final long handle, final int majorVersion, final int minorVersion); - private native void optimizeForSmallDb(final long handle); + private static native void optimizeForSmallDb(final long handle); private static native void optimizeForSmallDb(final long handle, final long cacheHandle); - private native void optimizeForPointLookup(long handle, - long blockCacheSizeMb); - private native void optimizeLevelStyleCompaction(long handle, - long memtableMemoryBudget); - private native void optimizeUniversalStyleCompaction(long handle, - long memtableMemoryBudget); - private native void setComparatorHandle(long handle, int builtinComparator); - private native void setComparatorHandle(long optHandle, - long comparatorHandle, byte comparatorType); - private native void setMergeOperatorName(long handle, String name); - private native void setMergeOperator(long handle, long mergeOperatorHandle); - private native void setCompactionFilterHandle(long handle, - long compactionFilterHandle); - private native void setCompactionFilterFactoryHandle(long handle, - long compactionFilterFactoryHandle); - private native void setWriteBufferSize(long handle, long writeBufferSize) + private static native void optimizeForPointLookup(long handle, long blockCacheSizeMb); + private static native void optimizeLevelStyleCompaction(long handle, long memtableMemoryBudget); + private static native void optimizeUniversalStyleCompaction( + long handle, long memtableMemoryBudget); + private static native void setComparatorHandle(long handle, int builtinComparator); + private static native void setComparatorHandle( + long optHandle, long comparatorHandle, byte comparatorType); + private static native void setMergeOperatorName(long handle, String name); + private static native void setMergeOperator(long handle, long mergeOperatorHandle); + private static native void setCompactionFilterHandle(long handle, long compactionFilterHandle); + private static native void setCompactionFilterFactoryHandle( + long handle, long compactionFilterFactoryHandle); + private static native void setWriteBufferSize(long handle, long writeBufferSize) throws IllegalArgumentException; - private native long writeBufferSize(long handle); - private native void setMaxWriteBufferNumber( - long handle, int maxWriteBufferNumber); - private native int maxWriteBufferNumber(long handle); - private native void setMinWriteBufferNumberToMerge( + private static native long writeBufferSize(long handle); + private static native void setMaxWriteBufferNumber(long handle, int maxWriteBufferNumber); + private static native int maxWriteBufferNumber(long handle); + private static native void setMinWriteBufferNumberToMerge( long handle, int minWriteBufferNumberToMerge); - private native int minWriteBufferNumberToMerge(long handle); - private native void setCompressionType(long handle, byte compressionType); - private native byte compressionType(long handle); - private native void setCompressionPerLevel(long handle, - byte[] compressionLevels); - private native byte[] compressionPerLevel(long handle); - private native void setBottommostCompressionType(long handle, - byte bottommostCompressionType); - private native byte bottommostCompressionType(long handle); - private native void setBottommostCompressionOptions(final long handle, - final long bottommostCompressionOptionsHandle); - private native void setCompressionOptions(long handle, - long compressionOptionsHandle); - private native void useFixedLengthPrefixExtractor( - long handle, int prefixLength); - private native void useCappedPrefixExtractor( - long handle, int prefixLength); - private native void setNumLevels( - long handle, int numLevels); - private native int numLevels(long handle); - private native void setLevelZeroFileNumCompactionTrigger( - long handle, int numFiles); - private native int levelZeroFileNumCompactionTrigger(long handle); - private native void setLevelZeroSlowdownWritesTrigger( - long handle, int numFiles); - private native int levelZeroSlowdownWritesTrigger(long handle); - private native void setLevelZeroStopWritesTrigger( - long handle, int numFiles); - private native int levelZeroStopWritesTrigger(long handle); - private native void setTargetFileSizeBase( - long handle, long targetFileSizeBase); - private native long targetFileSizeBase(long handle); - private native void setTargetFileSizeMultiplier( - long handle, int multiplier); - private native int targetFileSizeMultiplier(long handle); - private native void setMaxBytesForLevelBase( - long handle, long maxBytesForLevelBase); - private native long maxBytesForLevelBase(long handle); - private native void setLevelCompactionDynamicLevelBytes( + private static native int minWriteBufferNumberToMerge(long handle); + private static native void setCompressionType(long handle, byte compressionType); + private static native byte compressionType(long handle); + private static native void setCompressionPerLevel(long handle, byte[] compressionLevels); + private static native byte[] compressionPerLevel(long handle); + private static native void setBottommostCompressionType( + long handle, byte bottommostCompressionType); + private static native byte bottommostCompressionType(long handle); + private static native void setBottommostCompressionOptions( + final long handle, final long bottommostCompressionOptionsHandle); + private static native void setCompressionOptions(long handle, long compressionOptionsHandle); + private static native void useFixedLengthPrefixExtractor(long handle, int prefixLength); + private static native void useCappedPrefixExtractor(long handle, int prefixLength); + private static native void setNumLevels(long handle, int numLevels); + private static native int numLevels(long handle); + private static native void setLevelZeroFileNumCompactionTrigger(long handle, int numFiles); + private static native int levelZeroFileNumCompactionTrigger(long handle); + private static native void setLevelZeroSlowdownWritesTrigger(long handle, int numFiles); + private static native int levelZeroSlowdownWritesTrigger(long handle); + private static native void setLevelZeroStopWritesTrigger(long handle, int numFiles); + private static native int levelZeroStopWritesTrigger(long handle); + private static native void setTargetFileSizeBase(long handle, long targetFileSizeBase); + private static native long targetFileSizeBase(long handle); + private static native void setTargetFileSizeMultiplier(long handle, int multiplier); + private static native int targetFileSizeMultiplier(long handle); + private static native void setMaxBytesForLevelBase(long handle, long maxBytesForLevelBase); + private static native long maxBytesForLevelBase(long handle); + private static native void setLevelCompactionDynamicLevelBytes( long handle, boolean enableLevelCompactionDynamicLevelBytes); - private native boolean levelCompactionDynamicLevelBytes( - long handle); - private native void setMaxBytesForLevelMultiplier(long handle, double multiplier); - private native double maxBytesForLevelMultiplier(long handle); - private native void setMaxCompactionBytes(long handle, long maxCompactionBytes); - private native long maxCompactionBytes(long handle); - private native void setArenaBlockSize( - long handle, long arenaBlockSize) + private static native boolean levelCompactionDynamicLevelBytes(long handle); + private static native void setMaxBytesForLevelMultiplier(long handle, double multiplier); + private static native double maxBytesForLevelMultiplier(long handle); + private static native void setMaxCompactionBytes(long handle, long maxCompactionBytes); + private static native long maxCompactionBytes(long handle); + private static native void setArenaBlockSize(long handle, long arenaBlockSize) throws IllegalArgumentException; - private native long arenaBlockSize(long handle); - private native void setDisableAutoCompactions( - long handle, boolean disableAutoCompactions); - private native boolean disableAutoCompactions(long handle); - private native void setCompactionStyle(long handle, byte compactionStyle); - private native byte compactionStyle(long handle); - private native void setMaxTableFilesSizeFIFO( - long handle, long max_table_files_size); - private native long maxTableFilesSizeFIFO(long handle); - private native void setMaxSequentialSkipInIterations( + private static native long arenaBlockSize(long handle); + private static native void setDisableAutoCompactions(long handle, boolean disableAutoCompactions); + private static native boolean disableAutoCompactions(long handle); + private static native void setCompactionStyle(long handle, byte compactionStyle); + private static native byte compactionStyle(long handle); + private static native void setMaxTableFilesSizeFIFO(long handle, long max_table_files_size); + private static native long maxTableFilesSizeFIFO(long handle); + private static native void setMaxSequentialSkipInIterations( long handle, long maxSequentialSkipInIterations); - private native long maxSequentialSkipInIterations(long handle); - private native void setMemTableFactory(long handle, long factoryHandle); - private native String memTableFactoryName(long handle); - private native void setTableFactory(long handle, long factoryHandle); - private native String tableFactoryName(long handle); + private static native long maxSequentialSkipInIterations(long handle); + private static native void setMemTableFactory(long handle, long factoryHandle); + private static native String memTableFactoryName(long handle); + private static native void setTableFactory(long handle, long factoryHandle); + private static native String tableFactoryName(long handle); private static native void setCfPaths( final long handle, final String[] paths, final long[] targetSizes); private static native long cfPathsLen(final long handle); private static native void cfPaths( final long handle, final String[] paths, final long[] targetSizes); - private native void setInplaceUpdateSupport( - long handle, boolean inplaceUpdateSupport); - private native boolean inplaceUpdateSupport(long handle); - private native void setInplaceUpdateNumLocks( - long handle, long inplaceUpdateNumLocks) + private static native void setInplaceUpdateSupport(long handle, boolean inplaceUpdateSupport); + private static native boolean inplaceUpdateSupport(long handle); + private static native void setInplaceUpdateNumLocks(long handle, long inplaceUpdateNumLocks) throws IllegalArgumentException; - private native long inplaceUpdateNumLocks(long handle); - private native void setMemtablePrefixBloomSizeRatio( + private static native long inplaceUpdateNumLocks(long handle); + private static native void setMemtablePrefixBloomSizeRatio( long handle, double memtablePrefixBloomSizeRatio); - private native double memtablePrefixBloomSizeRatio(long handle); - private native void setExperimentalMempurgeThreshold( + private static native double memtablePrefixBloomSizeRatio(long handle); + private static native void setExperimentalMempurgeThreshold( long handle, double experimentalMempurgeThreshold); - private native double experimentalMempurgeThreshold(long handle); - private native void setMemtableWholeKeyFiltering(long handle, boolean memtableWholeKeyFiltering); - private native boolean memtableWholeKeyFiltering(long handle); - private native void setBloomLocality( - long handle, int bloomLocality); - private native int bloomLocality(long handle); - private native void setMaxSuccessiveMerges( - long handle, long maxSuccessiveMerges) + private static native double experimentalMempurgeThreshold(long handle); + private static native void setMemtableWholeKeyFiltering( + long handle, boolean memtableWholeKeyFiltering); + private static native boolean memtableWholeKeyFiltering(long handle); + private static native void setBloomLocality(long handle, int bloomLocality); + private static native int bloomLocality(long handle); + private static native void setMaxSuccessiveMerges(long handle, long maxSuccessiveMerges) throws IllegalArgumentException; - private native long maxSuccessiveMerges(long handle); - private native void setOptimizeFiltersForHits(long handle, - boolean optimizeFiltersForHits); - private native boolean optimizeFiltersForHits(long handle); - private native void setMemtableHugePageSize(long handle, - long memtableHugePageSize); - private native long memtableHugePageSize(long handle); - private native void setSoftPendingCompactionBytesLimit(long handle, - long softPendingCompactionBytesLimit); - private native long softPendingCompactionBytesLimit(long handle); - private native void setHardPendingCompactionBytesLimit(long handle, - long hardPendingCompactionBytesLimit); - private native long hardPendingCompactionBytesLimit(long handle); - private native void setLevel0FileNumCompactionTrigger(long handle, - int level0FileNumCompactionTrigger); - private native int level0FileNumCompactionTrigger(long handle); - private native void setLevel0SlowdownWritesTrigger(long handle, - int level0SlowdownWritesTrigger); - private native int level0SlowdownWritesTrigger(long handle); - private native void setLevel0StopWritesTrigger(long handle, - int level0StopWritesTrigger); - private native int level0StopWritesTrigger(long handle); - private native void setMaxBytesForLevelMultiplierAdditional(long handle, - int[] maxBytesForLevelMultiplierAdditional); - private native int[] maxBytesForLevelMultiplierAdditional(long handle); - private native void setParanoidFileChecks(long handle, - boolean paranoidFileChecks); - private native boolean paranoidFileChecks(long handle); - private native void setMaxWriteBufferNumberToMaintain(final long handle, - final int maxWriteBufferNumberToMaintain); - private native int maxWriteBufferNumberToMaintain(final long handle); - private native void setCompactionPriority(final long handle, - final byte compactionPriority); - private native byte compactionPriority(final long handle); - private native void setReportBgIoStats(final long handle, - final boolean reportBgIoStats); - private native boolean reportBgIoStats(final long handle); - private native void setTtl(final long handle, final long ttl); - private native long ttl(final long handle); - private native void setPeriodicCompactionSeconds( + private static native long maxSuccessiveMerges(long handle); + private static native void setOptimizeFiltersForHits(long handle, boolean optimizeFiltersForHits); + private static native boolean optimizeFiltersForHits(long handle); + private static native void setMemtableHugePageSize(long handle, long memtableHugePageSize); + private static native long memtableHugePageSize(long handle); + private static native void setSoftPendingCompactionBytesLimit( + long handle, long softPendingCompactionBytesLimit); + private static native long softPendingCompactionBytesLimit(long handle); + private static native void setHardPendingCompactionBytesLimit( + long handle, long hardPendingCompactionBytesLimit); + private static native long hardPendingCompactionBytesLimit(long handle); + private static native void setLevel0FileNumCompactionTrigger( + long handle, int level0FileNumCompactionTrigger); + private static native int level0FileNumCompactionTrigger(long handle); + private static native void setLevel0SlowdownWritesTrigger( + long handle, int level0SlowdownWritesTrigger); + private static native int level0SlowdownWritesTrigger(long handle); + private static native void setLevel0StopWritesTrigger(long handle, int level0StopWritesTrigger); + private static native int level0StopWritesTrigger(long handle); + private static native void setMaxBytesForLevelMultiplierAdditional( + long handle, int[] maxBytesForLevelMultiplierAdditional); + private static native int[] maxBytesForLevelMultiplierAdditional(long handle); + private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks); + private static native boolean paranoidFileChecks(long handle); + private static native void setMaxWriteBufferNumberToMaintain( + final long handle, final int maxWriteBufferNumberToMaintain); + private static native int maxWriteBufferNumberToMaintain(final long handle); + private static native void setCompactionPriority( + final long handle, final byte compactionPriority); + private static native byte compactionPriority(final long handle); + private static native void setReportBgIoStats(final long handle, final boolean reportBgIoStats); + private static native boolean reportBgIoStats(final long handle); + private static native void setTtl(final long handle, final long ttl); + private static native long ttl(final long handle); + private static native void setPeriodicCompactionSeconds( final long handle, final long periodicCompactionSeconds); - private native long periodicCompactionSeconds(final long handle); - private native void setCompactionOptionsUniversal(final long handle, - final long compactionOptionsUniversalHandle); - private native void setCompactionOptionsFIFO(final long handle, - final long compactionOptionsFIFOHandle); - private native void setForceConsistencyChecks(final long handle, - final boolean forceConsistencyChecks); - private native boolean forceConsistencyChecks(final long handle); - private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private static native long periodicCompactionSeconds(final long handle); + private static native void setCompactionOptionsUniversal( + final long handle, final long compactionOptionsUniversalHandle); + private static native void setCompactionOptionsFIFO( + final long handle, final long compactionOptionsFIFOHandle); + private static native void setForceConsistencyChecks( + final long handle, final boolean forceConsistencyChecks); + private static native boolean forceConsistencyChecks(final long handle); + private static native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); private static native void setCompactionThreadLimiter( final long nativeHandle_, final long compactionThreadLimiterHandle); - private native void setMemtableMaxRangeDeletions(final long handle, final int count); - private native int memtableMaxRangeDeletions(final long handle); - private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); - private native boolean enableBlobFiles(final long nativeHandle_); - private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); - private native long minBlobSize(final long nativeHandle_); - private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); - private native long blobFileSize(final long nativeHandle_); - private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType); - private native byte blobCompressionType(final long nativeHandle_); - private native void setEnableBlobGarbageCollection( + private static native void setMemtableMaxRangeDeletions(final long handle, final int count); + private static native int memtableMaxRangeDeletions(final long handle); + + private static native void setEnableBlobFiles( + final long nativeHandle_, final boolean enableBlobFiles); + private static native boolean enableBlobFiles(final long nativeHandle_); + private static native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); + private static native long minBlobSize(final long nativeHandle_); + private static native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); + private static native long blobFileSize(final long nativeHandle_); + private static native void setBlobCompressionType( + final long nativeHandle_, final byte compressionType); + private static native byte blobCompressionType(final long nativeHandle_); + private static native void setEnableBlobGarbageCollection( final long nativeHandle_, final boolean enableBlobGarbageCollection); - private native boolean enableBlobGarbageCollection(final long nativeHandle_); - private native void setBlobGarbageCollectionAgeCutoff( + private static native boolean enableBlobGarbageCollection(final long nativeHandle_); + private static native void setBlobGarbageCollectionAgeCutoff( final long nativeHandle_, final double blobGarbageCollectionAgeCutoff); - private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); - private native void setBlobGarbageCollectionForceThreshold( + private static native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); + private static native void setBlobGarbageCollectionForceThreshold( final long nativeHandle_, final double blobGarbageCollectionForceThreshold); - private native double blobGarbageCollectionForceThreshold(final long nativeHandle_); - private native void setBlobCompactionReadaheadSize( + private static native double blobGarbageCollectionForceThreshold(final long nativeHandle_); + private static native void setBlobCompactionReadaheadSize( final long nativeHandle_, final long blobCompactionReadaheadSize); - private native long blobCompactionReadaheadSize(final long nativeHandle_); - private native void setBlobFileStartingLevel( + private static native long blobCompactionReadaheadSize(final long nativeHandle_); + private static native void setBlobFileStartingLevel( final long nativeHandle_, final int blobFileStartingLevel); - private native int blobFileStartingLevel(final long nativeHandle_); - private native void setPrepopulateBlobCache( + private static native int blobFileStartingLevel(final long nativeHandle_); + private static native void setPrepopulateBlobCache( final long nativeHandle_, final byte prepopulateBlobCache); - private native byte prepopulateBlobCache(final long nativeHandle_); + private static native byte prepopulateBlobCache(final long nativeHandle_); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! diff --git a/java/src/main/java/org/rocksdb/CompactRangeOptions.java b/java/src/main/java/org/rocksdb/CompactRangeOptions.java index 616a77572d4..ba5fa6455d2 100644 --- a/java/src/main/java/org/rocksdb/CompactRangeOptions.java +++ b/java/src/main/java/org/rocksdb/CompactRangeOptions.java @@ -269,36 +269,35 @@ public boolean canceled() { } private static native long newCompactRangeOptions(); - @Override protected final native void disposeInternal(final long handle); - - private native boolean exclusiveManualCompaction(final long handle); - private native void setExclusiveManualCompaction(final long handle, - final boolean exclusive_manual_compaction); - private native boolean changeLevel(final long handle); - private native void setChangeLevel(final long handle, - final boolean changeLevel); - private native int targetLevel(final long handle); - private native void setTargetLevel(final long handle, - final int targetLevel); - private native int targetPathId(final long handle); - private native void setTargetPathId(final long handle, - final int targetPathId); - private native int bottommostLevelCompaction(final long handle); - private native void setBottommostLevelCompaction(final long handle, - final int bottommostLevelCompaction); - private native boolean allowWriteStall(final long handle); - private native void setAllowWriteStall(final long handle, - final boolean allowWriteStall); - private native void setMaxSubcompactions(final long handle, - final int maxSubcompactions); - private native int maxSubcompactions(final long handle); - - private native void setFullHistoryTSLow( + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + private static native boolean exclusiveManualCompaction(final long handle); + private static native void setExclusiveManualCompaction( + final long handle, final boolean exclusive_manual_compaction); + private static native boolean changeLevel(final long handle); + private static native void setChangeLevel(final long handle, final boolean changeLevel); + private static native int targetLevel(final long handle); + private static native void setTargetLevel(final long handle, final int targetLevel); + private static native int targetPathId(final long handle); + private static native void setTargetPathId(final long handle, final int targetPathId); + private static native int bottommostLevelCompaction(final long handle); + private static native void setBottommostLevelCompaction( + final long handle, final int bottommostLevelCompaction); + private static native boolean allowWriteStall(final long handle); + private static native void setAllowWriteStall(final long handle, final boolean allowWriteStall); + private static native void setMaxSubcompactions(final long handle, final int maxSubcompactions); + private static native int maxSubcompactions(final long handle); + + private static native void setFullHistoryTSLow( final long handle, final long timestampStart, final long timestampRange); - private native Timestamp fullHistoryTSLow(final long handle); + private static native Timestamp fullHistoryTSLow(final long handle); - private native void setCanceled(final long handle, final boolean canceled); + private static native void setCanceled(final long handle, final boolean canceled); - private native boolean canceled(final long handle); + private static native boolean canceled(final long handle); } diff --git a/java/src/main/java/org/rocksdb/CompactionJobInfo.java b/java/src/main/java/org/rocksdb/CompactionJobInfo.java index cf04bde2493..29369f174a1 100644 --- a/java/src/main/java/org/rocksdb/CompactionJobInfo.java +++ b/java/src/main/java/org/rocksdb/CompactionJobInfo.java @@ -143,7 +143,11 @@ public CompressionType compression() { private static native long newCompactionJobInfo(); - @Override protected native void disposeInternal(final long handle); + @Override + protected void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native byte[] columnFamilyName(final long handle); private static native Status status(final long handle); diff --git a/java/src/main/java/org/rocksdb/CompactionJobStats.java b/java/src/main/java/org/rocksdb/CompactionJobStats.java index 3d53b5565e6..857de7b6243 100644 --- a/java/src/main/java/org/rocksdb/CompactionJobStats.java +++ b/java/src/main/java/org/rocksdb/CompactionJobStats.java @@ -263,8 +263,11 @@ public long numSingleDelMismatch() { } private static native long newCompactionJobStats(); - @Override protected native void disposeInternal(final long handle); - + @Override + protected void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native void reset(final long handle); private static native void add(final long handle, diff --git a/java/src/main/java/org/rocksdb/CompactionOptions.java b/java/src/main/java/org/rocksdb/CompactionOptions.java index 2c7e391fbf7..08cbdf6378b 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptions.java +++ b/java/src/main/java/org/rocksdb/CompactionOptions.java @@ -107,7 +107,11 @@ public CompactionOptions setMaxSubcompactions(final int maxSubcompactions) { } private static native long newCompactionOptions(); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native byte compression(final long handle); private static native void setCompression(final long handle, diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java index 92b21fc50c3..24ebe0da2ff 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java +++ b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java @@ -76,12 +76,14 @@ public boolean allowCompaction() { } private static native long newCompactionOptionsFIFO(); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); - private native void setMaxTableFilesSize(final long handle, - final long maxTableFilesSize); - private native long maxTableFilesSize(final long handle); - private native void setAllowCompaction(final long handle, - final boolean allowCompaction); - private native boolean allowCompaction(final long handle); + private static native void setMaxTableFilesSize(final long handle, final long maxTableFilesSize); + private static native long maxTableFilesSize(final long handle); + private static native void setAllowCompaction(final long handle, final boolean allowCompaction); + private static native boolean allowCompaction(final long handle); } diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java b/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java index 4d2ebdb1f56..f18915b8f56 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java +++ b/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java @@ -248,26 +248,26 @@ public boolean allowTrivialMove() { } private static native long newCompactionOptionsUniversal(); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); - private native void setSizeRatio(final long handle, final int sizeRatio); - private native int sizeRatio(final long handle); - private native void setMinMergeWidth( - final long handle, final int minMergeWidth); - private native int minMergeWidth(final long handle); - private native void setMaxMergeWidth( - final long handle, final int maxMergeWidth); - private native int maxMergeWidth(final long handle); - private native void setMaxSizeAmplificationPercent( + private static native void setSizeRatio(final long handle, final int sizeRatio); + private static native int sizeRatio(final long handle); + private static native void setMinMergeWidth(final long handle, final int minMergeWidth); + private static native int minMergeWidth(final long handle); + private static native void setMaxMergeWidth(final long handle, final int maxMergeWidth); + private static native int maxMergeWidth(final long handle); + private static native void setMaxSizeAmplificationPercent( final long handle, final int maxSizeAmplificationPercent); - private native int maxSizeAmplificationPercent(final long handle); - private native void setCompressionSizePercent( + private static native int maxSizeAmplificationPercent(final long handle); + private static native void setCompressionSizePercent( final long handle, final int compressionSizePercent); - private native int compressionSizePercent(final long handle); - private native void setStopStyle( - final long handle, final byte stopStyle); - private native byte stopStyle(final long handle); - private native void setAllowTrivialMove( - final long handle, final boolean allowTrivialMove); - private native boolean allowTrivialMove(final long handle); + private static native int compressionSizePercent(final long handle); + private static native void setStopStyle(final long handle, final byte stopStyle); + private static native byte stopStyle(final long handle); + private static native void setAllowTrivialMove(final long handle, final boolean allowTrivialMove); + private static native boolean allowTrivialMove(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ComparatorOptions.java b/java/src/main/java/org/rocksdb/ComparatorOptions.java index ee5beb8f6ed..da287b51816 100644 --- a/java/src/main/java/org/rocksdb/ComparatorOptions.java +++ b/java/src/main/java/org/rocksdb/ComparatorOptions.java @@ -120,14 +120,17 @@ public ComparatorOptions setMaxReusedBufferSize(final int maxReusedBufferSize) { } private static native long newComparatorOptions(); - private native byte reusedSynchronisationType(final long handle); - private native void setReusedSynchronisationType(final long handle, - final byte reusedSynchronisationType); - private native boolean useDirectBuffer(final long handle); - private native void setUseDirectBuffer(final long handle, - final boolean useDirectBuffer); - private native int maxReusedBufferSize(final long handle); - private native void setMaxReusedBufferSize(final long handle, - final int maxReuseBufferSize); - @Override protected final native void disposeInternal(final long handle); + private static native byte reusedSynchronisationType(final long handle); + private static native void setReusedSynchronisationType( + final long handle, final byte reusedSynchronisationType); + private static native boolean useDirectBuffer(final long handle); + private static native void setUseDirectBuffer(final long handle, final boolean useDirectBuffer); + private static native int maxReusedBufferSize(final long handle); + private static native void setMaxReusedBufferSize( + final long handle, final int maxReuseBufferSize); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/CompressionOptions.java b/java/src/main/java/org/rocksdb/CompressionOptions.java index 2e1ee57310b..e6316af451e 100644 --- a/java/src/main/java/org/rocksdb/CompressionOptions.java +++ b/java/src/main/java/org/rocksdb/CompressionOptions.java @@ -132,19 +132,22 @@ public boolean enabled() { } private static native long newCompressionOptions(); - @Override protected final native void disposeInternal(final long handle); - - private native void setWindowBits(final long handle, final int windowBits); - private native int windowBits(final long handle); - private native void setLevel(final long handle, final int level); - private native int level(final long handle); - private native void setStrategy(final long handle, final int strategy); - private native int strategy(final long handle); - private native void setMaxDictBytes(final long handle, final int maxDictBytes); - private native int maxDictBytes(final long handle); - private native void setZstdMaxTrainBytes(final long handle, - final int zstdMaxTrainBytes); - private native int zstdMaxTrainBytes(final long handle); - private native void setEnabled(final long handle, final boolean enabled); - private native boolean enabled(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + private static native void setWindowBits(final long handle, final int windowBits); + private static native int windowBits(final long handle); + private static native void setLevel(final long handle, final int level); + private static native int level(final long handle); + private static native void setStrategy(final long handle, final int strategy); + private static native int strategy(final long handle); + private static native void setMaxDictBytes(final long handle, final int maxDictBytes); + private static native int maxDictBytes(final long handle); + private static native void setZstdMaxTrainBytes(final long handle, final int zstdMaxTrainBytes); + private static native int zstdMaxTrainBytes(final long handle); + private static native void setEnabled(final long handle, final boolean enabled); + private static native boolean enabled(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java b/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java index d28b9060a63..1c496ff2f5c 100644 --- a/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java +++ b/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java @@ -44,5 +44,9 @@ private static native long newConcurrentTaskLimiterImpl0( private static native void resetMaxOutstandingTask(final long handle); private static native int outstandingTask(final long handle); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ConfigOptions.java b/java/src/main/java/org/rocksdb/ConfigOptions.java index b3b5423c876..4717750b7f4 100644 --- a/java/src/main/java/org/rocksdb/ConfigOptions.java +++ b/java/src/main/java/org/rocksdb/ConfigOptions.java @@ -38,7 +38,12 @@ public ConfigOptions setSanityLevel(final SanityLevel level) { return this; } - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); private static long newConfigOptionsInstance() { RocksDB.loadLibrary(); diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index de10c058501..60303dc5ab2 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -213,9 +213,9 @@ public DBOptions setSstFileManager(final SstFileManager sstFileManager) { } @Override - public DBOptions setLogger(final Logger logger) { + public DBOptions setLogger(final LoggerInterface logger) { assert(isOwningHandle()); - setLogger(nativeHandle_, logger.nativeHandle_); + setLogger(nativeHandle_, logger.getNativeHandle(), logger.getLoggerType().getValue()); return this; } @@ -746,21 +746,6 @@ public long dbWriteBufferSize() { return dbWriteBufferSize(nativeHandle_); } - @Override - @Deprecated - public DBOptions setAccessHintOnCompactionStart(final AccessHint accessHint) { - assert(isOwningHandle()); - setAccessHintOnCompactionStart(nativeHandle_, accessHint.getValue()); - return this; - } - - @Override - @Deprecated - public AccessHint accessHintOnCompactionStart() { - assert(isOwningHandle()); - return AccessHint.getAccessHint(accessHintOnCompactionStart(nativeHandle_)); - } - @Override public DBOptions setCompactionReadaheadSize(final long compactionReadaheadSize) { assert(isOwningHandle()); @@ -1256,216 +1241,183 @@ private static long newDBOptionsInstance() { private static native long copyDBOptions(final long handle); private static native long newDBOptionsFromOptions(final long optionsHandle); - @Override protected final native void disposeInternal(final long handle); - - private native void optimizeForSmallDb(final long handle); - private native void setIncreaseParallelism(long handle, int totalThreads); - private native void setCreateIfMissing(long handle, boolean flag); - private native boolean createIfMissing(long handle); - private native void setCreateMissingColumnFamilies( - long handle, boolean flag); - private native boolean createMissingColumnFamilies(long handle); - private native void setEnv(long handle, long envHandle); - private native void setErrorIfExists(long handle, boolean errorIfExists); - private native boolean errorIfExists(long handle); - private native void setParanoidChecks( - long handle, boolean paranoidChecks); - private native boolean paranoidChecks(long handle); - private native void setRateLimiter(long handle, - long rateLimiterHandle); - private native void setSstFileManager(final long handle, - final long sstFileManagerHandle); - private native void setLogger(long handle, - long loggerHandle); - private native void setInfoLogLevel(long handle, byte logLevel); - private native byte infoLogLevel(long handle); - private native void setMaxOpenFiles(long handle, int maxOpenFiles); - private native int maxOpenFiles(long handle); - private native void setMaxFileOpeningThreads(final long handle, - final int maxFileOpeningThreads); - private native int maxFileOpeningThreads(final long handle); - private native void setMaxTotalWalSize(long handle, - long maxTotalWalSize); - private native long maxTotalWalSize(long handle); - private native void setStatistics(final long handle, final long statisticsHandle); - private native long statistics(final long handle); - private native boolean useFsync(long handle); - private native void setUseFsync(long handle, boolean useFsync); - private native void setDbPaths(final long handle, final String[] paths, - final long[] targetSizes); - private native long dbPathsLen(final long handle); - private native void dbPaths(final long handle, final String[] paths, - final long[] targetSizes); - private native void setDbLogDir(long handle, String dbLogDir); - private native String dbLogDir(long handle); - private native void setWalDir(long handle, String walDir); - private native String walDir(long handle); - private native void setDeleteObsoleteFilesPeriodMicros( - long handle, long micros); - private native long deleteObsoleteFilesPeriodMicros(long handle); - private native void setMaxBackgroundCompactions( - long handle, int maxBackgroundCompactions); - private native int maxBackgroundCompactions(long handle); - private native void setMaxSubcompactions(long handle, int maxSubcompactions); - private native int maxSubcompactions(long handle); - private native void setMaxBackgroundFlushes( - long handle, int maxBackgroundFlushes); - private native int maxBackgroundFlushes(long handle); - private native void setMaxBackgroundJobs(long handle, int maxBackgroundJobs); - private native int maxBackgroundJobs(long handle); - private native void setMaxLogFileSize(long handle, long maxLogFileSize) + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + private static native void optimizeForSmallDb(final long handle); + private static native void setIncreaseParallelism(long handle, int totalThreads); + private static native void setCreateIfMissing(long handle, boolean flag); + private static native boolean createIfMissing(long handle); + private static native void setCreateMissingColumnFamilies(long handle, boolean flag); + private static native boolean createMissingColumnFamilies(long handle); + private static native void setEnv(long handle, long envHandle); + private static native void setErrorIfExists(long handle, boolean errorIfExists); + private static native boolean errorIfExists(long handle); + private static native void setParanoidChecks(long handle, boolean paranoidChecks); + private static native boolean paranoidChecks(long handle); + private static native void setRateLimiter(long handle, long rateLimiterHandle); + private static native void setSstFileManager(final long handle, final long sstFileManagerHandle); + private static native void setLogger( + final long handle, final long loggerHandle, final byte loggerType); + private static native void setInfoLogLevel(long handle, byte logLevel); + private static native byte infoLogLevel(long handle); + private static native void setMaxOpenFiles(long handle, int maxOpenFiles); + private static native int maxOpenFiles(long handle); + private static native void setMaxFileOpeningThreads( + final long handle, final int maxFileOpeningThreads); + private static native int maxFileOpeningThreads(final long handle); + private static native void setMaxTotalWalSize(long handle, long maxTotalWalSize); + private static native long maxTotalWalSize(long handle); + private static native void setStatistics(final long handle, final long statisticsHandle); + private static native long statistics(final long handle); + private static native boolean useFsync(long handle); + private static native void setUseFsync(long handle, boolean useFsync); + private static native void setDbPaths( + final long handle, final String[] paths, final long[] targetSizes); + private static native long dbPathsLen(final long handle); + private static native void dbPaths( + final long handle, final String[] paths, final long[] targetSizes); + private static native void setDbLogDir(long handle, String dbLogDir); + private static native String dbLogDir(long handle); + private static native void setWalDir(long handle, String walDir); + private static native String walDir(long handle); + private static native void setDeleteObsoleteFilesPeriodMicros(long handle, long micros); + private static native long deleteObsoleteFilesPeriodMicros(long handle); + private static native void setMaxBackgroundCompactions(long handle, int maxBackgroundCompactions); + private static native int maxBackgroundCompactions(long handle); + private static native void setMaxSubcompactions(long handle, int maxSubcompactions); + private static native int maxSubcompactions(long handle); + private static native void setMaxBackgroundFlushes(long handle, int maxBackgroundFlushes); + private static native int maxBackgroundFlushes(long handle); + private static native void setMaxBackgroundJobs(long handle, int maxBackgroundJobs); + private static native int maxBackgroundJobs(long handle); + private static native void setMaxLogFileSize(long handle, long maxLogFileSize) + throws IllegalArgumentException; + private static native long maxLogFileSize(long handle); + private static native void setLogFileTimeToRoll(long handle, long logFileTimeToRoll) throws IllegalArgumentException; - private native long maxLogFileSize(long handle); - private native void setLogFileTimeToRoll( - long handle, long logFileTimeToRoll) throws IllegalArgumentException; - private native long logFileTimeToRoll(long handle); - private native void setKeepLogFileNum(long handle, long keepLogFileNum) + private static native long logFileTimeToRoll(long handle); + private static native void setKeepLogFileNum(long handle, long keepLogFileNum) throws IllegalArgumentException; - private native long keepLogFileNum(long handle); - private native void setRecycleLogFileNum(long handle, long recycleLogFileNum); - private native long recycleLogFileNum(long handle); - private native void setMaxManifestFileSize( - long handle, long maxManifestFileSize); - private native long maxManifestFileSize(long handle); - private native void setTableCacheNumshardbits( - long handle, int tableCacheNumshardbits); - private native int tableCacheNumshardbits(long handle); - private native void setWalTtlSeconds(long handle, long walTtlSeconds); - private native long walTtlSeconds(long handle); - private native void setWalSizeLimitMB(long handle, long sizeLimitMB); - private native long walSizeLimitMB(long handle); + private static native long keepLogFileNum(long handle); + private static native void setRecycleLogFileNum(long handle, long recycleLogFileNum); + private static native long recycleLogFileNum(long handle); + private static native void setMaxManifestFileSize(long handle, long maxManifestFileSize); + private static native long maxManifestFileSize(long handle); + private static native void setTableCacheNumshardbits(long handle, int tableCacheNumshardbits); + private static native int tableCacheNumshardbits(long handle); + private static native void setWalTtlSeconds(long handle, long walTtlSeconds); + private static native long walTtlSeconds(long handle); + private static native void setWalSizeLimitMB(long handle, long sizeLimitMB); + private static native long walSizeLimitMB(long handle); private static native void setMaxWriteBatchGroupSizeBytes( final long handle, final long maxWriteBatchGroupSizeBytes); private static native long maxWriteBatchGroupSizeBytes(final long handle); - private native void setManifestPreallocationSize( - long handle, long size) throws IllegalArgumentException; - private native long manifestPreallocationSize(long handle); - private native void setUseDirectReads(long handle, boolean useDirectReads); - private native boolean useDirectReads(long handle); - private native void setUseDirectIoForFlushAndCompaction( + private static native void setManifestPreallocationSize(long handle, long size) + throws IllegalArgumentException; + private static native long manifestPreallocationSize(long handle); + private static native void setUseDirectReads(long handle, boolean useDirectReads); + private static native boolean useDirectReads(long handle); + private static native void setUseDirectIoForFlushAndCompaction( long handle, boolean useDirectIoForFlushAndCompaction); - private native boolean useDirectIoForFlushAndCompaction(long handle); - private native void setAllowFAllocate(final long handle, - final boolean allowFAllocate); - private native boolean allowFAllocate(final long handle); - private native void setAllowMmapReads( - long handle, boolean allowMmapReads); - private native boolean allowMmapReads(long handle); - private native void setAllowMmapWrites( - long handle, boolean allowMmapWrites); - private native boolean allowMmapWrites(long handle); - private native void setIsFdCloseOnExec( - long handle, boolean isFdCloseOnExec); - private native boolean isFdCloseOnExec(long handle); - private native void setStatsDumpPeriodSec( - long handle, int statsDumpPeriodSec); - private native int statsDumpPeriodSec(long handle); - private native void setStatsPersistPeriodSec( + private static native boolean useDirectIoForFlushAndCompaction(long handle); + private static native void setAllowFAllocate(final long handle, final boolean allowFAllocate); + private static native boolean allowFAllocate(final long handle); + private static native void setAllowMmapReads(long handle, boolean allowMmapReads); + private static native boolean allowMmapReads(long handle); + private static native void setAllowMmapWrites(long handle, boolean allowMmapWrites); + private static native boolean allowMmapWrites(long handle); + private static native void setIsFdCloseOnExec(long handle, boolean isFdCloseOnExec); + private static native boolean isFdCloseOnExec(long handle); + private static native void setStatsDumpPeriodSec(long handle, int statsDumpPeriodSec); + private static native int statsDumpPeriodSec(long handle); + private static native void setStatsPersistPeriodSec( final long handle, final int statsPersistPeriodSec); - private native int statsPersistPeriodSec( - final long handle); - private native void setStatsHistoryBufferSize( + private static native int statsPersistPeriodSec(final long handle); + private static native void setStatsHistoryBufferSize( final long handle, final long statsHistoryBufferSize); - private native long statsHistoryBufferSize( - final long handle); - private native void setAdviseRandomOnOpen( - long handle, boolean adviseRandomOnOpen); - private native boolean adviseRandomOnOpen(long handle); - private native void setDbWriteBufferSize(final long handle, - final long dbWriteBufferSize); - private native void setWriteBufferManager(final long dbOptionsHandle, - final long writeBufferManagerHandle); - private native long dbWriteBufferSize(final long handle); - private native void setAccessHintOnCompactionStart(final long handle, - final byte accessHintOnCompactionStart); - private native byte accessHintOnCompactionStart(final long handle); - private native void setCompactionReadaheadSize(final long handle, - final long compactionReadaheadSize); - private native long compactionReadaheadSize(final long handle); - private native void setRandomAccessMaxBufferSize(final long handle, - final long randomAccessMaxBufferSize); - private native long randomAccessMaxBufferSize(final long handle); - private native void setWritableFileMaxBufferSize(final long handle, - final long writableFileMaxBufferSize); - private native long writableFileMaxBufferSize(final long handle); - private native void setUseAdaptiveMutex( - long handle, boolean useAdaptiveMutex); - private native boolean useAdaptiveMutex(long handle); - private native void setBytesPerSync( - long handle, long bytesPerSync); - private native long bytesPerSync(long handle); - private native void setWalBytesPerSync(long handle, long walBytesPerSync); - private native long walBytesPerSync(long handle); - private native void setStrictBytesPerSync( + private static native long statsHistoryBufferSize(final long handle); + private static native void setAdviseRandomOnOpen(long handle, boolean adviseRandomOnOpen); + private static native boolean adviseRandomOnOpen(long handle); + private static native void setDbWriteBufferSize(final long handle, final long dbWriteBufferSize); + private static native void setWriteBufferManager( + final long dbOptionsHandle, final long writeBufferManagerHandle); + private static native long dbWriteBufferSize(final long handle); + private static native void setCompactionReadaheadSize( + final long handle, final long compactionReadaheadSize); + private static native long compactionReadaheadSize(final long handle); + private static native void setRandomAccessMaxBufferSize( + final long handle, final long randomAccessMaxBufferSize); + private static native long randomAccessMaxBufferSize(final long handle); + private static native void setWritableFileMaxBufferSize( + final long handle, final long writableFileMaxBufferSize); + private static native long writableFileMaxBufferSize(final long handle); + private static native void setUseAdaptiveMutex(long handle, boolean useAdaptiveMutex); + private static native boolean useAdaptiveMutex(long handle); + private static native void setBytesPerSync(long handle, long bytesPerSync); + private static native long bytesPerSync(long handle); + private static native void setWalBytesPerSync(long handle, long walBytesPerSync); + private static native long walBytesPerSync(long handle); + private static native void setStrictBytesPerSync( final long handle, final boolean strictBytesPerSync); - private native boolean strictBytesPerSync( - final long handle); + private static native boolean strictBytesPerSync(final long handle); private static native void setEventListeners( final long handle, final long[] eventListenerHandles); private static native AbstractEventListener[] eventListeners(final long handle); - private native void setEnableThreadTracking(long handle, - boolean enableThreadTracking); - private native boolean enableThreadTracking(long handle); - private native void setDelayedWriteRate(long handle, long delayedWriteRate); - private native long delayedWriteRate(long handle); - private native void setEnablePipelinedWrite(final long handle, - final boolean enablePipelinedWrite); - private native boolean enablePipelinedWrite(final long handle); - private native void setUnorderedWrite(final long handle, - final boolean unorderedWrite); - private native boolean unorderedWrite(final long handle); - private native void setAllowConcurrentMemtableWrite(long handle, - boolean allowConcurrentMemtableWrite); - private native boolean allowConcurrentMemtableWrite(long handle); - private native void setEnableWriteThreadAdaptiveYield(long handle, - boolean enableWriteThreadAdaptiveYield); - private native boolean enableWriteThreadAdaptiveYield(long handle); - private native void setWriteThreadMaxYieldUsec(long handle, - long writeThreadMaxYieldUsec); - private native long writeThreadMaxYieldUsec(long handle); - private native void setWriteThreadSlowYieldUsec(long handle, - long writeThreadSlowYieldUsec); - private native long writeThreadSlowYieldUsec(long handle); - private native void setSkipStatsUpdateOnDbOpen(final long handle, - final boolean skipStatsUpdateOnDbOpen); - private native boolean skipStatsUpdateOnDbOpen(final long handle); + private static native void setEnableThreadTracking(long handle, boolean enableThreadTracking); + private static native boolean enableThreadTracking(long handle); + private static native void setDelayedWriteRate(long handle, long delayedWriteRate); + private static native long delayedWriteRate(long handle); + private static native void setEnablePipelinedWrite( + final long handle, final boolean enablePipelinedWrite); + private static native boolean enablePipelinedWrite(final long handle); + private static native void setUnorderedWrite(final long handle, final boolean unorderedWrite); + private static native boolean unorderedWrite(final long handle); + private static native void setAllowConcurrentMemtableWrite( + long handle, boolean allowConcurrentMemtableWrite); + private static native boolean allowConcurrentMemtableWrite(long handle); + private static native void setEnableWriteThreadAdaptiveYield( + long handle, boolean enableWriteThreadAdaptiveYield); + private static native boolean enableWriteThreadAdaptiveYield(long handle); + private static native void setWriteThreadMaxYieldUsec(long handle, long writeThreadMaxYieldUsec); + private static native long writeThreadMaxYieldUsec(long handle); + private static native void setWriteThreadSlowYieldUsec( + long handle, long writeThreadSlowYieldUsec); + private static native long writeThreadSlowYieldUsec(long handle); + private static native void setSkipStatsUpdateOnDbOpen( + final long handle, final boolean skipStatsUpdateOnDbOpen); + private static native boolean skipStatsUpdateOnDbOpen(final long handle); private static native void setSkipCheckingSstFileSizesOnDbOpen( final long handle, final boolean skipChecking); private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); - private native void setWalRecoveryMode(final long handle, - final byte walRecoveryMode); - private native byte walRecoveryMode(final long handle); - private native void setAllow2pc(final long handle, - final boolean allow2pc); - private native boolean allow2pc(final long handle); - private native void setRowCache(final long handle, - final long rowCacheHandle); - private native void setWalFilter(final long handle, - final long walFilterHandle); - private native void setFailIfOptionsFileError(final long handle, - final boolean failIfOptionsFileError); - private native boolean failIfOptionsFileError(final long handle); - private native void setDumpMallocStats(final long handle, - final boolean dumpMallocStats); - private native boolean dumpMallocStats(final long handle); - private native void setAvoidFlushDuringRecovery(final long handle, - final boolean avoidFlushDuringRecovery); - private native boolean avoidFlushDuringRecovery(final long handle); - private native void setAvoidFlushDuringShutdown(final long handle, - final boolean avoidFlushDuringShutdown); - private native boolean avoidFlushDuringShutdown(final long handle); - private native void setAllowIngestBehind(final long handle, - final boolean allowIngestBehind); - private native boolean allowIngestBehind(final long handle); - private native void setTwoWriteQueues(final long handle, - final boolean twoWriteQueues); - private native boolean twoWriteQueues(final long handle); - private native void setManualWalFlush(final long handle, - final boolean manualWalFlush); - private native boolean manualWalFlush(final long handle); - private native void setAtomicFlush(final long handle, - final boolean atomicFlush); - private native boolean atomicFlush(final long handle); + private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); + private static native byte walRecoveryMode(final long handle); + private static native void setAllow2pc(final long handle, final boolean allow2pc); + private static native boolean allow2pc(final long handle); + private static native void setRowCache(final long handle, final long rowCacheHandle); + private static native void setWalFilter(final long handle, final long walFilterHandle); + private static native void setFailIfOptionsFileError( + final long handle, final boolean failIfOptionsFileError); + private static native boolean failIfOptionsFileError(final long handle); + private static native void setDumpMallocStats(final long handle, final boolean dumpMallocStats); + private static native boolean dumpMallocStats(final long handle); + private static native void setAvoidFlushDuringRecovery( + final long handle, final boolean avoidFlushDuringRecovery); + private static native boolean avoidFlushDuringRecovery(final long handle); + private static native void setAvoidFlushDuringShutdown( + final long handle, final boolean avoidFlushDuringShutdown); + private static native boolean avoidFlushDuringShutdown(final long handle); + private static native void setAllowIngestBehind( + final long handle, final boolean allowIngestBehind); + private static native boolean allowIngestBehind(final long handle); + private static native void setTwoWriteQueues(final long handle, final boolean twoWriteQueues); + private static native boolean twoWriteQueues(final long handle); + private static native void setManualWalFlush(final long handle, final boolean manualWalFlush); + private static native boolean manualWalFlush(final long handle); + private static native void setAtomicFlush(final long handle, final boolean atomicFlush); + private static native boolean atomicFlush(final long handle); private static native void setAvoidUnnecessaryBlockingIO( final long handle, final boolean avoidBlockingIO); private static native boolean avoidUnnecessaryBlockingIO(final long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index 084a399cd03..a3a30dd5ef3 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -185,10 +185,10 @@ public interface DBOptionsInterface> { * *

Default: nullptr

* - * @param logger {@link Logger} instance. + * @param logger {@link LoggerInterface} instance. * @return the instance of the current object. */ - T setLogger(Logger logger); + T setLogger(LoggerInterface logger); /** *

Sets the RocksDB log level. Default level is INFO

@@ -938,28 +938,6 @@ public interface DBOptionsInterface> { */ long dbWriteBufferSize(); - /** - * Specify the file access pattern once a compaction is started. - * It will be applied to all input files of a compaction. - * - * Default: {@link AccessHint#NORMAL} - * - * @param accessHint The access hint - * - * @return the reference to the current options. - */ - @Deprecated T setAccessHintOnCompactionStart(final AccessHint accessHint); - - /** - * Specify the file access pattern once a compaction is started. - * It will be applied to all input files of a compaction. - * - * Default: {@link AccessHint#NORMAL} - * - * @return The access hint - */ - @Deprecated AccessHint accessHintOnCompactionStart(); - /** * This is a maximum buffer size that is used by WinMmapReadableFile in * unbuffered disk I/O mode. We need to maintain an aligned buffer for diff --git a/java/src/main/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java index 5aa0866ffe2..88ec29e3bd6 100644 --- a/java/src/main/java/org/rocksdb/DirectSlice.java +++ b/java/src/main/java/org/rocksdb/DirectSlice.java @@ -126,11 +126,9 @@ protected void disposeInternal() { private static native long createNewDirectSlice0(final ByteBuffer data, final int length); private static native long createNewDirectSlice1(final ByteBuffer data); @Override protected final native ByteBuffer data0(long handle); - private native byte get0(long handle, int offset); - private native void clear0(long handle, boolean internalBuffer, - long internalBufferOffset); - private native void removePrefix0(long handle, int length); - private native void setLength0(long handle, int length); - private native void disposeInternalBuf(final long handle, - long internalBufferOffset); + private static native byte get0(long handle, int offset); + private static native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); + private static native void removePrefix0(long handle, int length); + private static native void setLength0(long handle, int length); + private static native void disposeInternalBuf(final long handle, long internalBufferOffset); } diff --git a/java/src/main/java/org/rocksdb/Env.java b/java/src/main/java/org/rocksdb/Env.java index 6783d815811..83830614268 100644 --- a/java/src/main/java/org/rocksdb/Env.java +++ b/java/src/main/java/org/rocksdb/Env.java @@ -162,18 +162,13 @@ public List getThreadList() throws RocksDBException { } private static native long getDefaultEnvInternal(); - private native void setBackgroundThreads( + private static native void setBackgroundThreads( final long handle, final int number, final byte priority); - private native int getBackgroundThreads(final long handle, - final byte priority); - private native int getThreadPoolQueueLen(final long handle, - final byte priority); - private native void incBackgroundThreadsIfNeeded(final long handle, - final int number, final byte priority); - private native void lowerThreadPoolIOPriority(final long handle, - final byte priority); - private native void lowerThreadPoolCPUPriority(final long handle, - final byte priority); - private native ThreadStatus[] getThreadList(final long handle) - throws RocksDBException; + private static native int getBackgroundThreads(final long handle, final byte priority); + private static native int getThreadPoolQueueLen(final long handle, final byte priority); + private static native void incBackgroundThreadsIfNeeded( + final long handle, final int number, final byte priority); + private static native void lowerThreadPoolIOPriority(final long handle, final byte priority); + private static native void lowerThreadPoolCPUPriority(final long handle, final byte priority); + private static native ThreadStatus[] getThreadList(final long handle) throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/EnvOptions.java b/java/src/main/java/org/rocksdb/EnvOptions.java index fd56bc49e52..35bd53849d7 100644 --- a/java/src/main/java/org/rocksdb/EnvOptions.java +++ b/java/src/main/java/org/rocksdb/EnvOptions.java @@ -325,42 +325,38 @@ private static long newEnvOptionsInstance() { } private static native long newEnvOptions(); private static native long newEnvOptions(final long dboptions_handle); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); - private native void setUseMmapReads(final long handle, - final boolean useMmapReads); - private native boolean useMmapReads(final long handle); - private native void setUseMmapWrites(final long handle, - final boolean useMmapWrites); - private native boolean useMmapWrites(final long handle); - private native void setUseDirectReads(final long handle, - final boolean useDirectReads); - private native boolean useDirectReads(final long handle); - private native void setUseDirectWrites(final long handle, - final boolean useDirectWrites); - private native boolean useDirectWrites(final long handle); - private native void setAllowFallocate(final long handle, - final boolean allowFallocate); - private native boolean allowFallocate(final long handle); - private native void setSetFdCloexec(final long handle, - final boolean setFdCloexec); - private native boolean setFdCloexec(final long handle); - private native void setBytesPerSync(final long handle, - final long bytesPerSync); - private native long bytesPerSync(final long handle); - private native void setFallocateWithKeepSize( + private static native void setUseMmapReads(final long handle, final boolean useMmapReads); + private static native boolean useMmapReads(final long handle); + private static native void setUseMmapWrites(final long handle, final boolean useMmapWrites); + private static native boolean useMmapWrites(final long handle); + private static native void setUseDirectReads(final long handle, final boolean useDirectReads); + private static native boolean useDirectReads(final long handle); + private static native void setUseDirectWrites(final long handle, final boolean useDirectWrites); + private static native boolean useDirectWrites(final long handle); + private static native void setAllowFallocate(final long handle, final boolean allowFallocate); + private static native boolean allowFallocate(final long handle); + private static native void setSetFdCloexec(final long handle, final boolean setFdCloexec); + private static native boolean setFdCloexec(final long handle); + private static native void setBytesPerSync(final long handle, final long bytesPerSync); + private static native long bytesPerSync(final long handle); + private static native void setFallocateWithKeepSize( final long handle, final boolean fallocateWithKeepSize); - private native boolean fallocateWithKeepSize(final long handle); - private native void setCompactionReadaheadSize( + private static native boolean fallocateWithKeepSize(final long handle); + private static native void setCompactionReadaheadSize( final long handle, final long compactionReadaheadSize); - private native long compactionReadaheadSize(final long handle); - private native void setRandomAccessMaxBufferSize( + private static native long compactionReadaheadSize(final long handle); + private static native void setRandomAccessMaxBufferSize( final long handle, final long randomAccessMaxBufferSize); - private native long randomAccessMaxBufferSize(final long handle); - private native void setWritableFileMaxBufferSize( + private static native long randomAccessMaxBufferSize(final long handle); + private static native void setWritableFileMaxBufferSize( final long handle, final long writableFileMaxBufferSize); - private native long writableFileMaxBufferSize(final long handle); - private native void setRateLimiter(final long handle, - final long rateLimiterHandle); + private static native long writableFileMaxBufferSize(final long handle); + private static native void setRateLimiter(final long handle, final long rateLimiterHandle); private RateLimiter rateLimiter; } diff --git a/java/src/main/java/org/rocksdb/Filter.java b/java/src/main/java/org/rocksdb/Filter.java index 7f490cf594b..018807c0405 100644 --- a/java/src/main/java/org/rocksdb/Filter.java +++ b/java/src/main/java/org/rocksdb/Filter.java @@ -32,5 +32,9 @@ protected void disposeInternal() { } @Override - protected final native void disposeInternal(final long handle); + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/FlushOptions.java b/java/src/main/java/org/rocksdb/FlushOptions.java index be8c4bc9460..cb723fb7db5 100644 --- a/java/src/main/java/org/rocksdb/FlushOptions.java +++ b/java/src/main/java/org/rocksdb/FlushOptions.java @@ -78,12 +78,15 @@ private static long newFlushOptionsInance() { return newFlushOptions(); } private static native long newFlushOptions(); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); - private native void setWaitForFlush(final long handle, - final boolean wait); - private native boolean waitForFlush(final long handle); - private native void setAllowWriteStall(final long handle, - final boolean allowWriteStall); - private native boolean allowWriteStall(final long handle); + private static native void setWaitForFlush(final long handle, final boolean wait); + private static native boolean waitForFlush(final long handle); + private static native void setAllowWriteStall(final long handle, final boolean allowWriteStall); + private static native boolean allowWriteStall(final long handle); } diff --git a/java/src/main/java/org/rocksdb/GetStatus.java b/java/src/main/java/org/rocksdb/GetStatus.java new file mode 100644 index 00000000000..a2afafe39eb --- /dev/null +++ b/java/src/main/java/org/rocksdb/GetStatus.java @@ -0,0 +1,32 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * The result for a fetch + * and the total size of the object fetched. + * If the target of the fetch is not big enough, this may be bigger than the contents of the target. + */ +public class GetStatus { + public final Status status; + public final int requiredSize; + + /** + * Constructor used for success status, when the value is contained in the buffer + * + * @param status the status of the request to fetch into the buffer + * @param requiredSize the size of the data, which may be bigger than the buffer + */ + GetStatus(final Status status, final int requiredSize) { + this.status = status; + this.requiredSize = requiredSize; + } + + static GetStatus fromStatusCode(final Status.Code code, final int requiredSize) { + return new GetStatus(new Status(code, Status.SubCode.getSubCode((byte) 0), null), requiredSize); + } +} diff --git a/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java index a9868df57d7..cc18b61d226 100644 --- a/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -161,9 +161,8 @@ public int thresholdUseSkiplist() { thresholdUseSkiplist_); } - private native long newMemTableFactoryHandle(long bucketCount, - long hugePageTlbSize, int bucketEntriesLoggingThreshold, - boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist) + private static native long newMemTableFactoryHandle(long bucketCount, long hugePageTlbSize, + int bucketEntriesLoggingThreshold, boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist) throws IllegalArgumentException; private long bucketCount_; diff --git a/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java index 80d6b711518..33991f90f72 100644 --- a/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -96,9 +96,8 @@ public int branchingFactor() { bucketCount_, height_, branchingFactor_); } - private native long newMemTableFactoryHandle( - long bucketCount, int height, int branchingFactor) - throws IllegalArgumentException; + private static native long newMemTableFactoryHandle( + long bucketCount, int height, int branchingFactor) throws IllegalArgumentException; private long bucketCount_; private int branchingFactor_; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 41fe241ad3a..10d382e7b91 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -13,180 +13,205 @@ public enum HistogramType { COMPACTION_TIME((byte) 0x2), - SUBCOMPACTION_SETUP_TIME((byte) 0x3), + COMPACTION_CPU_TIME((byte) 0x3), - TABLE_SYNC_MICROS((byte) 0x4), + SUBCOMPACTION_SETUP_TIME((byte) 0x4), - COMPACTION_OUTFILE_SYNC_MICROS((byte) 0x5), + TABLE_SYNC_MICROS((byte) 0x5), - WAL_FILE_SYNC_MICROS((byte) 0x6), + COMPACTION_OUTFILE_SYNC_MICROS((byte) 0x6), - MANIFEST_FILE_SYNC_MICROS((byte) 0x7), + WAL_FILE_SYNC_MICROS((byte) 0x7), + + MANIFEST_FILE_SYNC_MICROS((byte) 0x8), /** * TIME SPENT IN IO DURING TABLE OPEN. */ - TABLE_OPEN_IO_MICROS((byte) 0x8), + TABLE_OPEN_IO_MICROS((byte) 0x9), - DB_MULTIGET((byte) 0x9), + DB_MULTIGET((byte) 0xA), - READ_BLOCK_COMPACTION_MICROS((byte) 0xA), + READ_BLOCK_COMPACTION_MICROS((byte) 0xB), - READ_BLOCK_GET_MICROS((byte) 0xB), + READ_BLOCK_GET_MICROS((byte) 0xC), - WRITE_RAW_BLOCK_MICROS((byte) 0xC), + WRITE_RAW_BLOCK_MICROS((byte) 0xD), - NUM_FILES_IN_SINGLE_COMPACTION((byte) 0x12), + NUM_FILES_IN_SINGLE_COMPACTION((byte) 0xE), - DB_SEEK((byte) 0x13), + DB_SEEK((byte) 0xF), - WRITE_STALL((byte) 0x14), + WRITE_STALL((byte) 0x10), - SST_READ_MICROS((byte) 0x15), + SST_READ_MICROS((byte) 0x11), - /** - * The number of subcompactions actually scheduled during a compaction. - */ - NUM_SUBCOMPACTIONS_SCHEDULED((byte) 0x16), + FILE_READ_FLUSH_MICROS((byte) 0x12), - /** - * Value size distribution in each operation. - */ - BYTES_PER_READ((byte) 0x17), - BYTES_PER_WRITE((byte) 0x18), - BYTES_PER_MULTIGET((byte) 0x19), + FILE_READ_COMPACTION_MICROS((byte) 0x13), + + FILE_READ_DB_OPEN_MICROS((byte) 0x14), + + FILE_READ_GET_MICROS((byte) 0x15), + + FILE_READ_MULTIGET_MICROS((byte) 0x16), + + FILE_READ_DB_ITERATOR_MICROS((byte) 0x17), + + FILE_READ_VERIFY_DB_CHECKSUM_MICROS((byte) 0x18), + + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS((byte) 0x19), + + SST_WRITE_MICROS((byte) 0x1A), + + FILE_WRITE_FLUSH_MICROS((byte) 0x1B), + + FILE_WRITE_COMPACTION_MICROS((byte) 0x1C), + + FILE_WRITE_DB_OPEN_MICROS((byte) 0x1D), /** - * number of bytes compressed. + * The number of subcompactions actually scheduled during a compaction. */ - BYTES_COMPRESSED((byte) 0x1A), + NUM_SUBCOMPACTIONS_SCHEDULED((byte) 0x1E), /** - * number of bytes decompressed. - *

- * number of bytes is when uncompressed; i.e. before/after respectively + * Value size distribution in each operation. */ - BYTES_DECOMPRESSED((byte) 0x1B), + BYTES_PER_READ((byte) 0x1F), + BYTES_PER_WRITE((byte) 0x20), + BYTES_PER_MULTIGET((byte) 0x21), - COMPRESSION_TIMES_NANOS((byte) 0x1C), + COMPRESSION_TIMES_NANOS((byte) 0x22), - DECOMPRESSION_TIMES_NANOS((byte) 0x1D), + DECOMPRESSION_TIMES_NANOS((byte) 0x23), - READ_NUM_MERGE_OPERANDS((byte) 0x1E), - - /** - * Time spent flushing memtable to disk. - */ - FLUSH_TIME((byte) 0x20), + READ_NUM_MERGE_OPERANDS((byte) 0x24), /** * Size of keys written to BlobDB. */ - BLOB_DB_KEY_SIZE((byte) 0x21), + BLOB_DB_KEY_SIZE((byte) 0x25), /** * Size of values written to BlobDB. */ - BLOB_DB_VALUE_SIZE((byte) 0x22), + BLOB_DB_VALUE_SIZE((byte) 0x26), /** * BlobDB Put/PutWithTTL/PutUntil/Write latency. */ - BLOB_DB_WRITE_MICROS((byte) 0x23), + BLOB_DB_WRITE_MICROS((byte) 0x27), /** * BlobDB Get lagency. */ - BLOB_DB_GET_MICROS((byte) 0x24), + BLOB_DB_GET_MICROS((byte) 0x28), /** * BlobDB MultiGet latency. */ - BLOB_DB_MULTIGET_MICROS((byte) 0x25), + BLOB_DB_MULTIGET_MICROS((byte) 0x29), /** * BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. */ - BLOB_DB_SEEK_MICROS((byte) 0x26), + BLOB_DB_SEEK_MICROS((byte) 0x2A), /** * BlobDB Next latency. */ - BLOB_DB_NEXT_MICROS((byte) 0x27), + BLOB_DB_NEXT_MICROS((byte) 0x2B), /** * BlobDB Prev latency. */ - BLOB_DB_PREV_MICROS((byte) 0x28), + BLOB_DB_PREV_MICROS((byte) 0x2C), /** * Blob file write latency. */ - BLOB_DB_BLOB_FILE_WRITE_MICROS((byte) 0x29), + BLOB_DB_BLOB_FILE_WRITE_MICROS((byte) 0x2D), /** * Blob file read latency. */ - BLOB_DB_BLOB_FILE_READ_MICROS((byte) 0x2A), + BLOB_DB_BLOB_FILE_READ_MICROS((byte) 0x2E), /** * Blob file sync latency. */ - BLOB_DB_BLOB_FILE_SYNC_MICROS((byte) 0x2B), + BLOB_DB_BLOB_FILE_SYNC_MICROS((byte) 0x2F), /** * BlobDB compression time. */ - BLOB_DB_COMPRESSION_MICROS((byte) 0x2D), + BLOB_DB_COMPRESSION_MICROS((byte) 0x30), /** * BlobDB decompression time. */ - BLOB_DB_DECOMPRESSION_MICROS((byte) 0x2E), + BLOB_DB_DECOMPRESSION_MICROS((byte) 0x31), /** - * Num of Index and Filter blocks read from file system per level in MultiGet - * request + * Time spent flushing memtable to disk. */ - NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x2F), + FLUSH_TIME((byte) 0x32), /** - * Num of SST files read from file system per level in MultiGet request. + * Number of MultiGet batch keys overlapping a file */ - NUM_SST_READ_PER_LEVEL((byte) 0x31), + SST_BATCH_SIZE((byte) 0x33), /** - * The number of retry in auto resume + * Size of a single IO batch issued by MultiGet */ - ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32), - - ASYNC_READ_BYTES((byte) 0x33), + MULTIGET_IO_BATCH_SIZE((byte) 0x34), /** - * Number of bytes read for RocksDB's prefetching contents - * (as opposed to file system's prefetch) - * from the end of SST table during block based table open + * Num of Index and Filter blocks read from file system per level in MultiGet + * request */ - TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39), + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x35), - FILE_READ_FLUSH_MICROS((byte) 0x3A), + /** + * Num of SST files read from file system per level in MultiGet request. + */ + NUM_SST_READ_PER_LEVEL((byte) 0x36), - FILE_READ_COMPACTION_MICROS((byte) 0x3B), + /** + * Num of LSM levels read from file system per MultiGet request. + */ + NUM_LEVEL_READ_PER_MULTIGET((byte) 0x37), - FILE_READ_DB_OPEN_MICROS((byte) 0x3C), + /** + * The number of retry in auto resume + */ + ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x38), - FILE_READ_GET_MICROS((byte) 0x3D), + ASYNC_READ_BYTES((byte) 0x39), - FILE_READ_MULTIGET_MICROS((byte) 0x3E), + POLL_WAIT_MICROS((byte) 0x3A), - FILE_READ_DB_ITERATOR_MICROS((byte) 0x3F), + /** + * Number of prefetched bytes discarded by RocksDB. + */ + PREFETCHED_BYTES_DISCARDED((byte) 0x3B), - FILE_READ_VERIFY_DB_CHECKSUM_MICROS((byte) 0x40), + /** + * Wait time for aborting async read in FilePrefetchBuffer destructor + */ + ASYNC_PREFETCH_ABORT_MICROS((byte) 0x3C), - FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS((byte) 0x41), + /** + * Number of bytes read for RocksDB's prefetching contents + * (as opposed to file system's prefetch) + * from the end of SST table during block based table open + */ + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x3D), - // 0x1F for backwards compatibility on current minor version. - HISTOGRAM_ENUM_MAX((byte) 0x1F); + // 0x3E for backwards compatibility on current minor version. + HISTOGRAM_ENUM_MAX((byte) 0x3E); private final byte value; diff --git a/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java b/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java index 1a6a5fccd94..aed28131a17 100644 --- a/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java +++ b/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java @@ -205,23 +205,25 @@ public IngestExternalFileOptions setWriteGlobalSeqno( private static native long newIngestExternalFileOptions(final boolean moveFiles, final boolean snapshotConsistency, final boolean allowGlobalSeqNo, final boolean allowBlockingFlush); - @Override protected final native void disposeInternal(final long handle); - - private native boolean moveFiles(final long handle); - private native void setMoveFiles(final long handle, final boolean move_files); - private native boolean snapshotConsistency(final long handle); - private native void setSnapshotConsistency(final long handle, - final boolean snapshotConsistency); - private native boolean allowGlobalSeqNo(final long handle); - private native void setAllowGlobalSeqNo(final long handle, - final boolean allowGloablSeqNo); - private native boolean allowBlockingFlush(final long handle); - private native void setAllowBlockingFlush(final long handle, - final boolean allowBlockingFlush); - private native boolean ingestBehind(final long handle); - private native void setIngestBehind(final long handle, - final boolean ingestBehind); - private native boolean writeGlobalSeqno(final long handle); - private native void setWriteGlobalSeqno(final long handle, - final boolean writeGlobalSeqNo); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); + + private static native boolean moveFiles(final long handle); + private static native void setMoveFiles(final long handle, final boolean move_files); + private static native boolean snapshotConsistency(final long handle); + private static native void setSnapshotConsistency( + final long handle, final boolean snapshotConsistency); + private static native boolean allowGlobalSeqNo(final long handle); + private static native void setAllowGlobalSeqNo(final long handle, final boolean allowGloablSeqNo); + private static native boolean allowBlockingFlush(final long handle); + private static native void setAllowBlockingFlush( + final long handle, final boolean allowBlockingFlush); + private static native boolean ingestBehind(final long handle); + private static native void setIngestBehind(final long handle, final boolean ingestBehind); + private static native boolean writeGlobalSeqno(final long handle); + private static native void setWriteGlobalSeqno(final long handle, final boolean writeGlobalSeqNo); } diff --git a/java/src/main/java/org/rocksdb/LRUCache.java b/java/src/main/java/org/rocksdb/LRUCache.java index 0a9d02e878e..342b8201fcb 100644 --- a/java/src/main/java/org/rocksdb/LRUCache.java +++ b/java/src/main/java/org/rocksdb/LRUCache.java @@ -102,5 +102,9 @@ public LRUCache(final long capacity, final int numShardBits, final boolean stric private static native long newLRUCache(final long capacity, final int numShardBits, final boolean strictCapacityLimit, final double highPriPoolRatio, final double lowPriPoolRatio); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/LiveFileMetaData.java b/java/src/main/java/org/rocksdb/LiveFileMetaData.java index cb0f1a30225..5242496a315 100644 --- a/java/src/main/java/org/rocksdb/LiveFileMetaData.java +++ b/java/src/main/java/org/rocksdb/LiveFileMetaData.java @@ -16,22 +16,13 @@ public class LiveFileMetaData extends SstFileMetaData { /** * Called from JNI C++ */ - private LiveFileMetaData( - final byte[] columnFamilyName, - final int level, - final String fileName, - final String path, - final long size, - final long smallestSeqno, - final long largestSeqno, - final byte[] smallestKey, - final byte[] largestKey, - final long numReadsSampled, - final boolean beingCompacted, - final long numEntries, - final long numDeletions) { - super(fileName, path, size, smallestSeqno, largestSeqno, smallestKey, - largestKey, numReadsSampled, beingCompacted, numEntries, numDeletions); + private LiveFileMetaData(final byte[] columnFamilyName, final int level, final String fileName, + final String path, final long size, final long smallestSeqno, final long largestSeqno, + final byte[] smallestKey, final byte[] largestKey, final long numReadsSampled, + final boolean beingCompacted, final long numEntries, final long numDeletions, + final byte[] fileChecksum) { + super(fileName, path, size, smallestSeqno, largestSeqno, smallestKey, largestKey, + numReadsSampled, beingCompacted, numEntries, numDeletions, fileChecksum); this.columnFamilyName = columnFamilyName; this.level = level; } diff --git a/java/src/main/java/org/rocksdb/Logger.java b/java/src/main/java/org/rocksdb/Logger.java index 614a7fa502f..b8d0e45efa0 100644 --- a/java/src/main/java/org/rocksdb/Logger.java +++ b/java/src/main/java/org/rocksdb/Logger.java @@ -35,10 +35,7 @@ * {@link org.rocksdb.InfoLogLevel#FATAL_LEVEL}. *

*/ -public abstract class Logger extends RocksCallbackObject { - private static final long WITH_OPTIONS = 0; - private static final long WITH_DBOPTIONS = 1; - +public abstract class Logger extends RocksCallbackObject implements LoggerInterface { /** *

AbstractLogger constructor.

* @@ -47,10 +44,13 @@ public abstract class Logger extends RocksCallbackObject { * maximum log level of RocksDB.

* * @param options {@link org.rocksdb.Options} instance. + * + * @deprecated Use {@link Logger#Logger(InfoLogLevel)} instead, e.g. {@code new + * Logger(options.infoLogLevel())}. */ + @Deprecated public Logger(final Options options) { - super(options.nativeHandle_, WITH_OPTIONS); - + this(options.infoLogLevel()); } /** @@ -61,56 +61,64 @@ public Logger(final Options options) { * as maximum log level of RocksDB.

* * @param dboptions {@link org.rocksdb.DBOptions} instance. + * + * @deprecated Use {@link Logger#Logger(InfoLogLevel)} instead, e.g. {@code new + * Logger(dbOptions.infoLogLevel())}. */ + @Deprecated public Logger(final DBOptions dboptions) { - super(dboptions.nativeHandle_, WITH_DBOPTIONS); + this(dboptions.infoLogLevel()); + } + + /** + *

AbstractLogger constructor.

+ * + * @param logLevel the log level. + */ + public Logger(final InfoLogLevel logLevel) { + super(logLevel.getValue()); } @Override protected long initializeNative(final long... nativeParameterHandles) { - if(nativeParameterHandles[1] == WITH_OPTIONS) { - return createNewLoggerOptions(nativeParameterHandles[0]); - } else if(nativeParameterHandles[1] == WITH_DBOPTIONS) { - return createNewLoggerDbOptions(nativeParameterHandles[0]); + if (nativeParameterHandles.length == 1) { + return newLogger(nativeParameterHandles[0]); } else { throw new IllegalArgumentException(); } } - /** - * Set {@link org.rocksdb.InfoLogLevel} to AbstractLogger. - * - * @param infoLogLevel {@link org.rocksdb.InfoLogLevel} instance. - */ - public void setInfoLogLevel(final InfoLogLevel infoLogLevel) { - setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); + @Override + public void setInfoLogLevel(final InfoLogLevel logLevel) { + setInfoLogLevel(nativeHandle_, logLevel.getValue()); } - /** - * Return the loggers log level. - * - * @return {@link org.rocksdb.InfoLogLevel} instance. - */ + @Override public InfoLogLevel infoLogLevel() { return InfoLogLevel.getInfoLogLevel( infoLogLevel(nativeHandle_)); } - protected abstract void log(InfoLogLevel infoLogLevel, - String logMsg); + @Override + public long getNativeHandle() { + return nativeHandle_; + } + + @Override + public final LoggerType getLoggerType() { + return LoggerType.JAVA_IMPLEMENTATION; + } + + protected abstract void log(final InfoLogLevel logLevel, final String logMsg); - protected native long createNewLoggerOptions( - long options); - protected native long createNewLoggerDbOptions( - long dbOptions); - protected native void setInfoLogLevel(long handle, - byte infoLogLevel); - protected native byte infoLogLevel(long handle); + protected native long newLogger(final long logLevel); + protected native void setInfoLogLevel(final long handle, final byte logLevel); + protected native byte infoLogLevel(final long handle); /** * We override {@link RocksCallbackObject#disposeInternal()} * as disposing of a rocksdb::LoggerJniCallback requires - * a slightly different approach as it is a std::shared_ptr + * a slightly different approach as it is a std::shared_ptr. */ @Override protected void disposeInternal() { diff --git a/java/src/main/java/org/rocksdb/LoggerInterface.java b/java/src/main/java/org/rocksdb/LoggerInterface.java new file mode 100644 index 00000000000..51239830b94 --- /dev/null +++ b/java/src/main/java/org/rocksdb/LoggerInterface.java @@ -0,0 +1,40 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * LoggerInterface is a thin interface that specifies the most basic + * functionality for a Java wrapper around a RocksDB Logger. + */ +public interface LoggerInterface { + /** + * Set the log level. + * + * @param logLevel the level at which to log. + */ + void setInfoLogLevel(final InfoLogLevel logLevel); + + /** + * Get the log level + * + * @return the level at which to log. + */ + InfoLogLevel infoLogLevel(); + + /** + * Get the underlying Native Handle. + * + * @return the native handle. + */ + long getNativeHandle(); + + /** + * Get the type of this logger. + * + * @return the type of this logger. + */ + LoggerType getLoggerType(); +} diff --git a/java/src/main/java/org/rocksdb/LoggerType.java b/java/src/main/java/org/rocksdb/LoggerType.java new file mode 100644 index 00000000000..f5d0b0d954c --- /dev/null +++ b/java/src/main/java/org/rocksdb/LoggerType.java @@ -0,0 +1,48 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +/** + * Simple enumeration used for differentiating + * the types of loggers when passing via the JNI + * boundary. + */ +public enum LoggerType { + JAVA_IMPLEMENTATION((byte) 0x1), + STDERR_IMPLEMENTATION((byte) 0x2); + + private final byte value; + + LoggerType(final byte value) { + this.value = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + byte getValue() { + return value; + } + + /** + * Get LoggerType by byte value. + * + * @param value byte representation of LoggerType. + * + * @return {@link org.rocksdb.LoggerType} instance. + * @throws java.lang.IllegalArgumentException if an invalid + * value is provided. + */ + static LoggerType getLoggerType(final byte value) { + for (final LoggerType loggerType : LoggerType.values()) { + if (loggerType.getValue() == value) { + return loggerType; + } + } + throw new IllegalArgumentException("Illegal value provided for LoggerType."); + } +} diff --git a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java index 8bf7b0d64be..1521fb4d08a 100644 --- a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java @@ -418,7 +418,7 @@ public interface MutableDBOptionsInterface * That way RocksDB's compaction is doing sequential instead of random reads. *

- * Default: 0 + * Default: 2MB * * @param compactionReadaheadSize The compaction read-ahead size * diff --git a/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java b/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java index b270b8d3669..c769f3fa0bf 100644 --- a/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java +++ b/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java @@ -53,5 +53,5 @@ protected void disposeInternal() { disposeInternal(nativeHandle_); } - private native void disposeInternal(final long handle); + private static native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java b/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java index 80d3c720bf6..4674eae010e 100644 --- a/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java +++ b/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java @@ -5,6 +5,7 @@ package org.rocksdb; +import java.util.Arrays; import java.util.List; /** @@ -45,6 +46,8 @@ public static OptimisticTransactionDB open(final Options options, // the currently-created RocksDB. otdb.storeOptionsInstance(options); + otdb.storeDefaultColumnFamilyHandle(otdb.makeDefaultColumnFamilyHandle()); + return otdb; } @@ -67,7 +70,7 @@ public static OptimisticTransactionDB open(final DBOptions dbOptions, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { - + int defaultColumnFamilyIndex = -1; final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { @@ -75,6 +78,13 @@ public static OptimisticTransactionDB open(final DBOptions dbOptions, .get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + if (Arrays.equals(cfDescriptor.getName(), RocksDB.DEFAULT_COLUMN_FAMILY)) { + defaultColumnFamilyIndex = i; + } + } + if (defaultColumnFamilyIndex < 0) { + throw new IllegalArgumentException( + "You must provide the default column family in your columnFamilyDescriptors"); } final long[] handles = open(dbOptions.nativeHandle_, path, cfNames, @@ -91,6 +101,9 @@ public static OptimisticTransactionDB open(final DBOptions dbOptions, columnFamilyHandles.add(new ColumnFamilyHandle(otdb, handles[i])); } + otdb.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + otdb.storeDefaultColumnFamilyHandle(columnFamilyHandles.get(defaultColumnFamilyIndex)); + return otdb; } @@ -132,6 +145,12 @@ public void closeE() throws RocksDBException { @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : // NOPMD - CloseResource + ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -204,23 +223,24 @@ public RocksDB getBaseDB() { return db; } - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); protected static native long open(final long optionsHandle, final String path) throws RocksDBException; protected static native long[] open(final long handle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions); private static native void closeDatabase(final long handle) throws RocksDBException; - private native long beginTransaction(final long handle, - final long writeOptionsHandle); - private native long beginTransaction(final long handle, - final long writeOptionsHandle, + private static native long beginTransaction(final long handle, final long writeOptionsHandle); + private static native long beginTransaction(final long handle, final long writeOptionsHandle, final long optimisticTransactionOptionsHandle); - private native long beginTransaction_withOld(final long handle, - final long writeOptionsHandle, final long oldTransactionHandle); - private native long beginTransaction_withOld(final long handle, - final long writeOptionsHandle, - final long optimisticTransactionOptionsHandle, + private static native long beginTransaction_withOld( + final long handle, final long writeOptionsHandle, final long oldTransactionHandle); + private static native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, final long optimisticTransactionOptionsHandle, final long oldTransactionHandle); - private native long getBaseDB(final long handle); + private static native long getBaseDB(final long handle); } diff --git a/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java b/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java index a2f5d85ab5c..f4111c7b187 100644 --- a/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java +++ b/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java @@ -44,10 +44,12 @@ public OptimisticTransactionOptions setComparator( } private static native long newOptimisticTransactionOptions(); - private native boolean isSetSnapshot(final long handle); - private native void setSetSnapshot(final long handle, - final boolean setSnapshot); - private native void setComparator(final long handle, - final long comparatorHandle); - @Override protected final native void disposeInternal(final long handle); + private static native boolean isSetSnapshot(final long handle); + private static native void setSetSnapshot(final long handle, final boolean setSnapshot); + private static native void setComparator(final long handle, final long comparatorHandle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 29f5e8e0d23..d3f6bdea5d9 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -7,6 +7,7 @@ import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; /** * Options to control the behavior of a database. It will be used @@ -833,21 +834,6 @@ public long dbWriteBufferSize() { return dbWriteBufferSize(nativeHandle_); } - @Override - @Deprecated - public Options setAccessHintOnCompactionStart(final AccessHint accessHint) { - assert(isOwningHandle()); - setAccessHintOnCompactionStart(nativeHandle_, accessHint.getValue()); - return this; - } - - @Override - @Deprecated - public AccessHint accessHintOnCompactionStart() { - assert(isOwningHandle()); - return AccessHint.getAccessHint(accessHintOnCompactionStart(nativeHandle_)); - } - @Override public Options setCompactionReadaheadSize(final long compactionReadaheadSize) { assert(isOwningHandle()); @@ -1245,9 +1231,9 @@ public Options setSstFileManager(final SstFileManager sstFileManager) { } @Override - public Options setLogger(final Logger logger) { + public Options setLogger(final LoggerInterface logger) { assert(isOwningHandle()); - setLogger(nativeHandle_, logger.nativeHandle_); + setLogger(nativeHandle_, logger.getNativeHandle(), logger.getLoggerType().getValue()); return this; } @@ -2123,6 +2109,35 @@ public PrepopulateBlobCache prepopulateBlobCache() { // END options for blobs (integrated BlobDB) // + /** + * Return copy of TablePropertiesCollectorFactory list. Modifying this list will not change + * underlying options C++ object. {@link #setTablePropertiesCollectorFactory(List) + * setTablePropertiesCollectorFactory} must be called to propagate changes. All instance must be + * properly closed to prevent memory leaks. + * @return copy of TablePropertiesCollectorFactory list. + */ + public List tablePropertiesCollectorFactory() { + long[] factoryHandlers = tablePropertiesCollectorFactory(nativeHandle_); + + return Arrays.stream(factoryHandlers) + .mapToObj(factoryHandle -> TablePropertiesCollectorFactory.newWrapper(factoryHandle)) + .collect(Collectors.toList()); + } + + /** + * Set TablePropertiesCollectorFactory in underlying C++ object. + * This method create its own copy of the list. Caller is responsible for + * closing all the instances in the list. + * @param factories + */ + public void setTablePropertiesCollectorFactory(List factories) { + long[] factoryHandlers = new long[factories.size()]; + for (int i = 0; i < factoryHandlers.length; i++) { + factoryHandlers[i] = factories.get(i).getNativeHandle(); + } + setTablePropertiesCollectorFactory(nativeHandle_, factoryHandlers); + } + private static long newOptionsInstance() { RocksDB.loadLibrary(); return newOptions(); @@ -2130,391 +2145,330 @@ private static long newOptionsInstance() { private static native long newOptions(); private static native long newOptions(long dbOptHandle, long cfOptHandle); private static native long copyOptions(long handle); - @Override protected final native void disposeInternal(final long handle); - private native void setEnv(long optHandle, long envHandle); - private native void prepareForBulkLoad(long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + private static native void setEnv(long optHandle, long envHandle); + private static native void prepareForBulkLoad(long handle); // DB native handles - private native void setIncreaseParallelism(long handle, int totalThreads); - private native void setCreateIfMissing(long handle, boolean flag); - private native boolean createIfMissing(long handle); - private native void setCreateMissingColumnFamilies( - long handle, boolean flag); - private native boolean createMissingColumnFamilies(long handle); - private native void setErrorIfExists(long handle, boolean errorIfExists); - private native boolean errorIfExists(long handle); - private native void setParanoidChecks( - long handle, boolean paranoidChecks); - private native boolean paranoidChecks(long handle); - private native void setRateLimiter(long handle, - long rateLimiterHandle); - private native void setSstFileManager(final long handle, - final long sstFileManagerHandle); - private native void setLogger(long handle, - long loggerHandle); - private native void setInfoLogLevel(long handle, byte logLevel); - private native byte infoLogLevel(long handle); - private native void setMaxOpenFiles(long handle, int maxOpenFiles); - private native int maxOpenFiles(long handle); - private native void setMaxTotalWalSize(long handle, - long maxTotalWalSize); - private native void setMaxFileOpeningThreads(final long handle, - final int maxFileOpeningThreads); - private native int maxFileOpeningThreads(final long handle); - private native long maxTotalWalSize(long handle); - private native void setStatistics(final long handle, final long statisticsHandle); - private native long statistics(final long handle); - private native boolean useFsync(long handle); - private native void setUseFsync(long handle, boolean useFsync); - private native void setDbPaths(final long handle, final String[] paths, - final long[] targetSizes); - private native long dbPathsLen(final long handle); - private native void dbPaths(final long handle, final String[] paths, - final long[] targetSizes); - private native void setDbLogDir(long handle, String dbLogDir); - private native String dbLogDir(long handle); - private native void setWalDir(long handle, String walDir); - private native String walDir(long handle); - private native void setDeleteObsoleteFilesPeriodMicros( - long handle, long micros); - private native long deleteObsoleteFilesPeriodMicros(long handle); - private native void setMaxBackgroundCompactions( - long handle, int maxBackgroundCompactions); - private native int maxBackgroundCompactions(long handle); - private native void setMaxSubcompactions(long handle, int maxSubcompactions); - private native int maxSubcompactions(long handle); - private native void setMaxBackgroundFlushes( - long handle, int maxBackgroundFlushes); - private native int maxBackgroundFlushes(long handle); - private native void setMaxBackgroundJobs(long handle, int maxMaxBackgroundJobs); - private native int maxBackgroundJobs(long handle); - private native void setMaxLogFileSize(long handle, long maxLogFileSize) + private static native void setIncreaseParallelism(long handle, int totalThreads); + private static native void setCreateIfMissing(long handle, boolean flag); + private static native boolean createIfMissing(long handle); + private static native void setCreateMissingColumnFamilies(long handle, boolean flag); + private static native boolean createMissingColumnFamilies(long handle); + private static native void setErrorIfExists(long handle, boolean errorIfExists); + private static native boolean errorIfExists(long handle); + private static native void setParanoidChecks(long handle, boolean paranoidChecks); + private static native boolean paranoidChecks(long handle); + private static native void setRateLimiter(long handle, long rateLimiterHandle); + private static native void setSstFileManager(final long handle, final long sstFileManagerHandle); + private static native void setLogger( + final long handle, final long loggerHandle, final byte loggerType); + private static native void setInfoLogLevel(long handle, byte logLevel); + private static native byte infoLogLevel(long handle); + private static native void setMaxOpenFiles(long handle, int maxOpenFiles); + private static native int maxOpenFiles(long handle); + private static native void setMaxTotalWalSize(long handle, long maxTotalWalSize); + private static native void setMaxFileOpeningThreads( + final long handle, final int maxFileOpeningThreads); + private static native int maxFileOpeningThreads(final long handle); + private static native long maxTotalWalSize(long handle); + private static native void setStatistics(final long handle, final long statisticsHandle); + private static native long statistics(final long handle); + private static native boolean useFsync(long handle); + private static native void setUseFsync(long handle, boolean useFsync); + private static native void setDbPaths( + final long handle, final String[] paths, final long[] targetSizes); + private static native long dbPathsLen(final long handle); + private static native void dbPaths( + final long handle, final String[] paths, final long[] targetSizes); + private static native void setDbLogDir(long handle, String dbLogDir); + private static native String dbLogDir(long handle); + private static native void setWalDir(long handle, String walDir); + private static native String walDir(long handle); + private static native void setDeleteObsoleteFilesPeriodMicros(long handle, long micros); + private static native long deleteObsoleteFilesPeriodMicros(long handle); + private static native void setMaxBackgroundCompactions(long handle, int maxBackgroundCompactions); + private static native int maxBackgroundCompactions(long handle); + private static native void setMaxSubcompactions(long handle, int maxSubcompactions); + private static native int maxSubcompactions(long handle); + private static native void setMaxBackgroundFlushes(long handle, int maxBackgroundFlushes); + private static native int maxBackgroundFlushes(long handle); + private static native void setMaxBackgroundJobs(long handle, int maxMaxBackgroundJobs); + private static native int maxBackgroundJobs(long handle); + private static native void setMaxLogFileSize(long handle, long maxLogFileSize) throws IllegalArgumentException; - private native long maxLogFileSize(long handle); - private native void setLogFileTimeToRoll( - long handle, long logFileTimeToRoll) throws IllegalArgumentException; - private native long logFileTimeToRoll(long handle); - private native void setKeepLogFileNum(long handle, long keepLogFileNum) + private static native long maxLogFileSize(long handle); + private static native void setLogFileTimeToRoll(long handle, long logFileTimeToRoll) throws IllegalArgumentException; - private native long keepLogFileNum(long handle); - private native void setRecycleLogFileNum(long handle, long recycleLogFileNum); - private native long recycleLogFileNum(long handle); - private native void setMaxManifestFileSize( - long handle, long maxManifestFileSize); - private native long maxManifestFileSize(long handle); - private native void setMaxTableFilesSizeFIFO( - long handle, long maxTableFilesSize); - private native long maxTableFilesSizeFIFO(long handle); - private native void setTableCacheNumshardbits( - long handle, int tableCacheNumshardbits); - private native int tableCacheNumshardbits(long handle); - private native void setWalTtlSeconds(long handle, long walTtlSeconds); - private native long walTtlSeconds(long handle); - private native void setWalSizeLimitMB(long handle, long sizeLimitMB); - private native long walSizeLimitMB(long handle); + private static native long logFileTimeToRoll(long handle); + private static native void setKeepLogFileNum(long handle, long keepLogFileNum) + throws IllegalArgumentException; + private static native long keepLogFileNum(long handle); + private static native void setRecycleLogFileNum(long handle, long recycleLogFileNum); + private static native long recycleLogFileNum(long handle); + private static native void setMaxManifestFileSize(long handle, long maxManifestFileSize); + private static native long maxManifestFileSize(long handle); + private static native void setMaxTableFilesSizeFIFO(long handle, long maxTableFilesSize); + private static native long maxTableFilesSizeFIFO(long handle); + private static native void setTableCacheNumshardbits(long handle, int tableCacheNumshardbits); + private static native int tableCacheNumshardbits(long handle); + private static native void setWalTtlSeconds(long handle, long walTtlSeconds); + private static native long walTtlSeconds(long handle); + private static native void setWalSizeLimitMB(long handle, long sizeLimitMB); + private static native long walSizeLimitMB(long handle); private static native void setMaxWriteBatchGroupSizeBytes( final long handle, final long maxWriteBatchGroupSizeBytes); private static native long maxWriteBatchGroupSizeBytes(final long handle); - private native void setManifestPreallocationSize( - long handle, long size) throws IllegalArgumentException; - private native long manifestPreallocationSize(long handle); - private native void setUseDirectReads(long handle, boolean useDirectReads); - private native boolean useDirectReads(long handle); - private native void setUseDirectIoForFlushAndCompaction( + private static native void setManifestPreallocationSize(long handle, long size) + throws IllegalArgumentException; + private static native long manifestPreallocationSize(long handle); + private static native void setUseDirectReads(long handle, boolean useDirectReads); + private static native boolean useDirectReads(long handle); + private static native void setUseDirectIoForFlushAndCompaction( long handle, boolean useDirectIoForFlushAndCompaction); - private native boolean useDirectIoForFlushAndCompaction(long handle); - private native void setAllowFAllocate(final long handle, - final boolean allowFAllocate); - private native boolean allowFAllocate(final long handle); - private native void setAllowMmapReads( - long handle, boolean allowMmapReads); - private native boolean allowMmapReads(long handle); - private native void setAllowMmapWrites( - long handle, boolean allowMmapWrites); - private native boolean allowMmapWrites(long handle); - private native void setIsFdCloseOnExec( - long handle, boolean isFdCloseOnExec); - private native boolean isFdCloseOnExec(long handle); - private native void setStatsDumpPeriodSec( - long handle, int statsDumpPeriodSec); - private native int statsDumpPeriodSec(long handle); - private native void setStatsPersistPeriodSec( + private static native boolean useDirectIoForFlushAndCompaction(long handle); + private static native void setAllowFAllocate(final long handle, final boolean allowFAllocate); + private static native boolean allowFAllocate(final long handle); + private static native void setAllowMmapReads(long handle, boolean allowMmapReads); + private static native boolean allowMmapReads(long handle); + private static native void setAllowMmapWrites(long handle, boolean allowMmapWrites); + private static native boolean allowMmapWrites(long handle); + private static native void setIsFdCloseOnExec(long handle, boolean isFdCloseOnExec); + private static native boolean isFdCloseOnExec(long handle); + private static native void setStatsDumpPeriodSec(long handle, int statsDumpPeriodSec); + private static native int statsDumpPeriodSec(long handle); + private static native void setStatsPersistPeriodSec( final long handle, final int statsPersistPeriodSec); - private native int statsPersistPeriodSec( - final long handle); - private native void setStatsHistoryBufferSize( + private static native int statsPersistPeriodSec(final long handle); + private static native void setStatsHistoryBufferSize( final long handle, final long statsHistoryBufferSize); - private native long statsHistoryBufferSize( - final long handle); - private native void setAdviseRandomOnOpen( - long handle, boolean adviseRandomOnOpen); - private native boolean adviseRandomOnOpen(long handle); - private native void setDbWriteBufferSize(final long handle, - final long dbWriteBufferSize); - private native void setWriteBufferManager(final long handle, - final long writeBufferManagerHandle); - private native long dbWriteBufferSize(final long handle); - private native void setAccessHintOnCompactionStart(final long handle, - final byte accessHintOnCompactionStart); - private native byte accessHintOnCompactionStart(final long handle); - private native void setCompactionReadaheadSize(final long handle, - final long compactionReadaheadSize); - private native long compactionReadaheadSize(final long handle); - private native void setRandomAccessMaxBufferSize(final long handle, - final long randomAccessMaxBufferSize); - private native long randomAccessMaxBufferSize(final long handle); - private native void setWritableFileMaxBufferSize(final long handle, - final long writableFileMaxBufferSize); - private native long writableFileMaxBufferSize(final long handle); - private native void setUseAdaptiveMutex( - long handle, boolean useAdaptiveMutex); - private native boolean useAdaptiveMutex(long handle); - private native void setBytesPerSync( - long handle, long bytesPerSync); - private native long bytesPerSync(long handle); - private native void setWalBytesPerSync(long handle, long walBytesPerSync); - private native long walBytesPerSync(long handle); - private native void setStrictBytesPerSync( + private static native long statsHistoryBufferSize(final long handle); + private static native void setAdviseRandomOnOpen(long handle, boolean adviseRandomOnOpen); + private static native boolean adviseRandomOnOpen(long handle); + private static native void setDbWriteBufferSize(final long handle, final long dbWriteBufferSize); + private static native void setWriteBufferManager( + final long handle, final long writeBufferManagerHandle); + private static native long dbWriteBufferSize(final long handle); + private static native void setCompactionReadaheadSize( + final long handle, final long compactionReadaheadSize); + private static native long compactionReadaheadSize(final long handle); + private static native void setRandomAccessMaxBufferSize( + final long handle, final long randomAccessMaxBufferSize); + private static native long randomAccessMaxBufferSize(final long handle); + private static native void setWritableFileMaxBufferSize( + final long handle, final long writableFileMaxBufferSize); + private static native long writableFileMaxBufferSize(final long handle); + private static native void setUseAdaptiveMutex(long handle, boolean useAdaptiveMutex); + private static native boolean useAdaptiveMutex(long handle); + private static native void setBytesPerSync(long handle, long bytesPerSync); + private static native long bytesPerSync(long handle); + private static native void setWalBytesPerSync(long handle, long walBytesPerSync); + private static native long walBytesPerSync(long handle); + private static native void setStrictBytesPerSync( final long handle, final boolean strictBytesPerSync); - private native boolean strictBytesPerSync( - final long handle); + private static native boolean strictBytesPerSync(final long handle); private static native void setEventListeners( final long handle, final long[] eventListenerHandles); private static native AbstractEventListener[] eventListeners(final long handle); - private native void setEnableThreadTracking(long handle, - boolean enableThreadTracking); - private native boolean enableThreadTracking(long handle); - private native void setDelayedWriteRate(long handle, long delayedWriteRate); - private native long delayedWriteRate(long handle); - private native void setEnablePipelinedWrite(final long handle, - final boolean pipelinedWrite); - private native boolean enablePipelinedWrite(final long handle); - private native void setUnorderedWrite(final long handle, - final boolean unorderedWrite); - private native boolean unorderedWrite(final long handle); - private native void setAllowConcurrentMemtableWrite(long handle, - boolean allowConcurrentMemtableWrite); - private native boolean allowConcurrentMemtableWrite(long handle); - private native void setEnableWriteThreadAdaptiveYield(long handle, - boolean enableWriteThreadAdaptiveYield); - private native boolean enableWriteThreadAdaptiveYield(long handle); - private native void setWriteThreadMaxYieldUsec(long handle, - long writeThreadMaxYieldUsec); - private native long writeThreadMaxYieldUsec(long handle); - private native void setWriteThreadSlowYieldUsec(long handle, - long writeThreadSlowYieldUsec); - private native long writeThreadSlowYieldUsec(long handle); - private native void setSkipStatsUpdateOnDbOpen(final long handle, - final boolean skipStatsUpdateOnDbOpen); - private native boolean skipStatsUpdateOnDbOpen(final long handle); + private static native void setEnableThreadTracking(long handle, boolean enableThreadTracking); + private static native boolean enableThreadTracking(long handle); + private static native void setDelayedWriteRate(long handle, long delayedWriteRate); + private static native long delayedWriteRate(long handle); + private static native void setEnablePipelinedWrite( + final long handle, final boolean pipelinedWrite); + private static native boolean enablePipelinedWrite(final long handle); + private static native void setUnorderedWrite(final long handle, final boolean unorderedWrite); + private static native boolean unorderedWrite(final long handle); + private static native void setAllowConcurrentMemtableWrite( + long handle, boolean allowConcurrentMemtableWrite); + private static native boolean allowConcurrentMemtableWrite(long handle); + private static native void setEnableWriteThreadAdaptiveYield( + long handle, boolean enableWriteThreadAdaptiveYield); + private static native boolean enableWriteThreadAdaptiveYield(long handle); + private static native void setWriteThreadMaxYieldUsec(long handle, long writeThreadMaxYieldUsec); + private static native long writeThreadMaxYieldUsec(long handle); + private static native void setWriteThreadSlowYieldUsec( + long handle, long writeThreadSlowYieldUsec); + private static native long writeThreadSlowYieldUsec(long handle); + private static native void setSkipStatsUpdateOnDbOpen( + final long handle, final boolean skipStatsUpdateOnDbOpen); + private static native boolean skipStatsUpdateOnDbOpen(final long handle); private static native void setSkipCheckingSstFileSizesOnDbOpen( final long handle, final boolean skipChecking); private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); - private native void setWalRecoveryMode(final long handle, - final byte walRecoveryMode); - private native byte walRecoveryMode(final long handle); - private native void setAllow2pc(final long handle, - final boolean allow2pc); - private native boolean allow2pc(final long handle); - private native void setRowCache(final long handle, - final long rowCacheHandle); - private native void setWalFilter(final long handle, - final long walFilterHandle); - private native void setFailIfOptionsFileError(final long handle, - final boolean failIfOptionsFileError); - private native boolean failIfOptionsFileError(final long handle); - private native void setDumpMallocStats(final long handle, - final boolean dumpMallocStats); - private native boolean dumpMallocStats(final long handle); - private native void setAvoidFlushDuringRecovery(final long handle, - final boolean avoidFlushDuringRecovery); - private native boolean avoidFlushDuringRecovery(final long handle); - private native void setAvoidFlushDuringShutdown(final long handle, - final boolean avoidFlushDuringShutdown); - private native boolean avoidFlushDuringShutdown(final long handle); - private native void setAllowIngestBehind(final long handle, - final boolean allowIngestBehind); - private native boolean allowIngestBehind(final long handle); - private native void setTwoWriteQueues(final long handle, - final boolean twoWriteQueues); - private native boolean twoWriteQueues(final long handle); - private native void setManualWalFlush(final long handle, - final boolean manualWalFlush); - private native boolean manualWalFlush(final long handle); - + private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); + private static native byte walRecoveryMode(final long handle); + private static native void setAllow2pc(final long handle, final boolean allow2pc); + private static native boolean allow2pc(final long handle); + private static native void setRowCache(final long handle, final long rowCacheHandle); + private static native void setWalFilter(final long handle, final long walFilterHandle); + private static native void setFailIfOptionsFileError( + final long handle, final boolean failIfOptionsFileError); + private static native boolean failIfOptionsFileError(final long handle); + private static native void setDumpMallocStats(final long handle, final boolean dumpMallocStats); + private static native boolean dumpMallocStats(final long handle); + private static native void setAvoidFlushDuringRecovery( + final long handle, final boolean avoidFlushDuringRecovery); + private static native boolean avoidFlushDuringRecovery(final long handle); + private static native void setAvoidFlushDuringShutdown( + final long handle, final boolean avoidFlushDuringShutdown); + private static native boolean avoidFlushDuringShutdown(final long handle); + private static native void setAllowIngestBehind( + final long handle, final boolean allowIngestBehind); + private static native boolean allowIngestBehind(final long handle); + private static native void setTwoWriteQueues(final long handle, final boolean twoWriteQueues); + private static native boolean twoWriteQueues(final long handle); + private static native void setManualWalFlush(final long handle, final boolean manualWalFlush); + private static native boolean manualWalFlush(final long handle); // CF native handles private static native void oldDefaults( final long handle, final int majorVersion, final int minorVersion); - private native void optimizeForSmallDb(final long handle); + private static native void optimizeForSmallDb(final long handle); private static native void optimizeForSmallDb(final long handle, final long cacheHandle); - private native void optimizeForPointLookup(long handle, - long blockCacheSizeMb); - private native void optimizeLevelStyleCompaction(long handle, - long memtableMemoryBudget); - private native void optimizeUniversalStyleCompaction(long handle, - long memtableMemoryBudget); - private native void setComparatorHandle(long handle, int builtinComparator); - private native void setComparatorHandle(long optHandle, - long comparatorHandle, byte comparatorType); - private native void setMergeOperatorName( - long handle, String name); - private native void setMergeOperator( - long handle, long mergeOperatorHandle); - private native void setCompactionFilterHandle( - long handle, long compactionFilterHandle); - private native void setCompactionFilterFactoryHandle( - long handle, long compactionFilterFactoryHandle); - private native void setWriteBufferSize(long handle, long writeBufferSize) + private static native void optimizeForPointLookup(long handle, long blockCacheSizeMb); + private static native void optimizeLevelStyleCompaction(long handle, long memtableMemoryBudget); + private static native void optimizeUniversalStyleCompaction( + long handle, long memtableMemoryBudget); + private static native void setComparatorHandle(long handle, int builtinComparator); + private static native void setComparatorHandle( + long optHandle, long comparatorHandle, byte comparatorType); + private static native void setMergeOperatorName(long handle, String name); + private static native void setMergeOperator(long handle, long mergeOperatorHandle); + private static native void setCompactionFilterHandle(long handle, long compactionFilterHandle); + private static native void setCompactionFilterFactoryHandle( + long handle, long compactionFilterFactoryHandle); + private static native void setWriteBufferSize(long handle, long writeBufferSize) throws IllegalArgumentException; - private native long writeBufferSize(long handle); - private native void setMaxWriteBufferNumber( - long handle, int maxWriteBufferNumber); - private native int maxWriteBufferNumber(long handle); - private native void setMinWriteBufferNumberToMerge( + private static native long writeBufferSize(long handle); + private static native void setMaxWriteBufferNumber(long handle, int maxWriteBufferNumber); + private static native int maxWriteBufferNumber(long handle); + private static native void setMinWriteBufferNumberToMerge( long handle, int minWriteBufferNumberToMerge); - private native int minWriteBufferNumberToMerge(long handle); - private native void setCompressionType(long handle, byte compressionType); - private native byte compressionType(long handle); - private native void setCompressionPerLevel(long handle, - byte[] compressionLevels); - private native byte[] compressionPerLevel(long handle); - private native void setBottommostCompressionType(long handle, - byte bottommostCompressionType); - private native byte bottommostCompressionType(long handle); - private native void setBottommostCompressionOptions(final long handle, - final long bottommostCompressionOptionsHandle); - private native void setCompressionOptions(long handle, - long compressionOptionsHandle); - private native void useFixedLengthPrefixExtractor( - long handle, int prefixLength); - private native void useCappedPrefixExtractor( - long handle, int prefixLength); - private native void setNumLevels( - long handle, int numLevels); - private native int numLevels(long handle); - private native void setLevelZeroFileNumCompactionTrigger( - long handle, int numFiles); - private native int levelZeroFileNumCompactionTrigger(long handle); - private native void setLevelZeroSlowdownWritesTrigger( - long handle, int numFiles); - private native int levelZeroSlowdownWritesTrigger(long handle); - private native void setLevelZeroStopWritesTrigger( - long handle, int numFiles); - private native int levelZeroStopWritesTrigger(long handle); - private native void setTargetFileSizeBase( - long handle, long targetFileSizeBase); - private native long targetFileSizeBase(long handle); - private native void setTargetFileSizeMultiplier( - long handle, int multiplier); - private native int targetFileSizeMultiplier(long handle); - private native void setMaxBytesForLevelBase( - long handle, long maxBytesForLevelBase); - private native long maxBytesForLevelBase(long handle); - private native void setLevelCompactionDynamicLevelBytes( + private static native int minWriteBufferNumberToMerge(long handle); + private static native void setCompressionType(long handle, byte compressionType); + private static native byte compressionType(long handle); + private static native void setCompressionPerLevel(long handle, byte[] compressionLevels); + private static native byte[] compressionPerLevel(long handle); + private static native void setBottommostCompressionType( + long handle, byte bottommostCompressionType); + private static native byte bottommostCompressionType(long handle); + private static native void setBottommostCompressionOptions( + final long handle, final long bottommostCompressionOptionsHandle); + private static native void setCompressionOptions(long handle, long compressionOptionsHandle); + private static native void useFixedLengthPrefixExtractor(long handle, int prefixLength); + private static native void useCappedPrefixExtractor(long handle, int prefixLength); + private static native void setNumLevels(long handle, int numLevels); + private static native int numLevels(long handle); + private static native void setLevelZeroFileNumCompactionTrigger(long handle, int numFiles); + private static native int levelZeroFileNumCompactionTrigger(long handle); + private static native void setLevelZeroSlowdownWritesTrigger(long handle, int numFiles); + private static native int levelZeroSlowdownWritesTrigger(long handle); + private static native void setLevelZeroStopWritesTrigger(long handle, int numFiles); + private static native int levelZeroStopWritesTrigger(long handle); + private static native void setTargetFileSizeBase(long handle, long targetFileSizeBase); + private static native long targetFileSizeBase(long handle); + private static native void setTargetFileSizeMultiplier(long handle, int multiplier); + private static native int targetFileSizeMultiplier(long handle); + private static native void setMaxBytesForLevelBase(long handle, long maxBytesForLevelBase); + private static native long maxBytesForLevelBase(long handle); + private static native void setLevelCompactionDynamicLevelBytes( long handle, boolean enableLevelCompactionDynamicLevelBytes); - private native boolean levelCompactionDynamicLevelBytes( - long handle); - private native void setMaxBytesForLevelMultiplier(long handle, double multiplier); - private native double maxBytesForLevelMultiplier(long handle); - private native void setMaxCompactionBytes(long handle, long maxCompactionBytes); - private native long maxCompactionBytes(long handle); - private native void setArenaBlockSize( - long handle, long arenaBlockSize) throws IllegalArgumentException; - private native long arenaBlockSize(long handle); - private native void setDisableAutoCompactions( - long handle, boolean disableAutoCompactions); - private native boolean disableAutoCompactions(long handle); - private native void setCompactionStyle(long handle, byte compactionStyle); - private native byte compactionStyle(long handle); - private native void setMaxSequentialSkipInIterations( + private static native boolean levelCompactionDynamicLevelBytes(long handle); + private static native void setMaxBytesForLevelMultiplier(long handle, double multiplier); + private static native double maxBytesForLevelMultiplier(long handle); + private static native void setMaxCompactionBytes(long handle, long maxCompactionBytes); + private static native long maxCompactionBytes(long handle); + private static native void setArenaBlockSize(long handle, long arenaBlockSize) + throws IllegalArgumentException; + private static native long arenaBlockSize(long handle); + private static native void setDisableAutoCompactions(long handle, boolean disableAutoCompactions); + private static native boolean disableAutoCompactions(long handle); + private static native void setCompactionStyle(long handle, byte compactionStyle); + private static native byte compactionStyle(long handle); + private static native void setMaxSequentialSkipInIterations( long handle, long maxSequentialSkipInIterations); - private native long maxSequentialSkipInIterations(long handle); - private native void setMemTableFactory(long handle, long factoryHandle); - private native String memTableFactoryName(long handle); - private native void setTableFactory(long handle, long factoryHandle); - private native String tableFactoryName(long handle); + private static native long maxSequentialSkipInIterations(long handle); + private static native void setMemTableFactory(long handle, long factoryHandle); + private static native String memTableFactoryName(long handle); + private static native void setTableFactory(long handle, long factoryHandle); + private static native String tableFactoryName(long handle); private static native void setCfPaths( final long handle, final String[] paths, final long[] targetSizes); private static native long cfPathsLen(final long handle); private static native void cfPaths( final long handle, final String[] paths, final long[] targetSizes); - private native void setInplaceUpdateSupport( - long handle, boolean inplaceUpdateSupport); - private native boolean inplaceUpdateSupport(long handle); - private native void setInplaceUpdateNumLocks( - long handle, long inplaceUpdateNumLocks) + private static native void setInplaceUpdateSupport(long handle, boolean inplaceUpdateSupport); + private static native boolean inplaceUpdateSupport(long handle); + private static native void setInplaceUpdateNumLocks(long handle, long inplaceUpdateNumLocks) throws IllegalArgumentException; - private native long inplaceUpdateNumLocks(long handle); - private native void setMemtablePrefixBloomSizeRatio( + private static native long inplaceUpdateNumLocks(long handle); + private static native void setMemtablePrefixBloomSizeRatio( long handle, double memtablePrefixBloomSizeRatio); - private native double memtablePrefixBloomSizeRatio(long handle); - private native void setExperimentalMempurgeThreshold( + private static native double memtablePrefixBloomSizeRatio(long handle); + private static native void setExperimentalMempurgeThreshold( long handle, double experimentalMempurgeThreshold); - private native double experimentalMempurgeThreshold(long handle); - private native void setMemtableWholeKeyFiltering(long handle, boolean memtableWholeKeyFiltering); - private native boolean memtableWholeKeyFiltering(long handle); - private native void setBloomLocality( - long handle, int bloomLocality); - private native int bloomLocality(long handle); - private native void setMaxSuccessiveMerges( - long handle, long maxSuccessiveMerges) + private static native double experimentalMempurgeThreshold(long handle); + private static native void setMemtableWholeKeyFiltering( + long handle, boolean memtableWholeKeyFiltering); + private static native boolean memtableWholeKeyFiltering(long handle); + private static native void setBloomLocality(long handle, int bloomLocality); + private static native int bloomLocality(long handle); + private static native void setMaxSuccessiveMerges(long handle, long maxSuccessiveMerges) throws IllegalArgumentException; - private native long maxSuccessiveMerges(long handle); - private native void setOptimizeFiltersForHits(long handle, - boolean optimizeFiltersForHits); - private native boolean optimizeFiltersForHits(long handle); - private native void setMemtableHugePageSize(long handle, - long memtableHugePageSize); - private native long memtableHugePageSize(long handle); - private native void setSoftPendingCompactionBytesLimit(long handle, - long softPendingCompactionBytesLimit); - private native long softPendingCompactionBytesLimit(long handle); - private native void setHardPendingCompactionBytesLimit(long handle, - long hardPendingCompactionBytesLimit); - private native long hardPendingCompactionBytesLimit(long handle); - private native void setLevel0FileNumCompactionTrigger(long handle, - int level0FileNumCompactionTrigger); - private native int level0FileNumCompactionTrigger(long handle); - private native void setLevel0SlowdownWritesTrigger(long handle, - int level0SlowdownWritesTrigger); - private native int level0SlowdownWritesTrigger(long handle); - private native void setLevel0StopWritesTrigger(long handle, - int level0StopWritesTrigger); - private native int level0StopWritesTrigger(long handle); - private native void setMaxBytesForLevelMultiplierAdditional(long handle, - int[] maxBytesForLevelMultiplierAdditional); - private native int[] maxBytesForLevelMultiplierAdditional(long handle); - private native void setParanoidFileChecks(long handle, - boolean paranoidFileChecks); - private native boolean paranoidFileChecks(long handle); - private native void setMaxWriteBufferNumberToMaintain(final long handle, - final int maxWriteBufferNumberToMaintain); - private native int maxWriteBufferNumberToMaintain(final long handle); - private native void setCompactionPriority(final long handle, - final byte compactionPriority); - private native byte compactionPriority(final long handle); - private native void setReportBgIoStats(final long handle, - final boolean reportBgIoStats); - private native boolean reportBgIoStats(final long handle); - private native void setTtl(final long handle, final long ttl); - private native long ttl(final long handle); - private native void setPeriodicCompactionSeconds( + private static native long maxSuccessiveMerges(long handle); + private static native void setOptimizeFiltersForHits(long handle, boolean optimizeFiltersForHits); + private static native boolean optimizeFiltersForHits(long handle); + private static native void setMemtableHugePageSize(long handle, long memtableHugePageSize); + private static native long memtableHugePageSize(long handle); + private static native void setSoftPendingCompactionBytesLimit( + long handle, long softPendingCompactionBytesLimit); + private static native long softPendingCompactionBytesLimit(long handle); + private static native void setHardPendingCompactionBytesLimit( + long handle, long hardPendingCompactionBytesLimit); + private static native long hardPendingCompactionBytesLimit(long handle); + private static native void setLevel0FileNumCompactionTrigger( + long handle, int level0FileNumCompactionTrigger); + private static native int level0FileNumCompactionTrigger(long handle); + private static native void setLevel0SlowdownWritesTrigger( + long handle, int level0SlowdownWritesTrigger); + private static native int level0SlowdownWritesTrigger(long handle); + private static native void setLevel0StopWritesTrigger(long handle, int level0StopWritesTrigger); + private static native int level0StopWritesTrigger(long handle); + private static native void setMaxBytesForLevelMultiplierAdditional( + long handle, int[] maxBytesForLevelMultiplierAdditional); + private static native int[] maxBytesForLevelMultiplierAdditional(long handle); + private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks); + private static native boolean paranoidFileChecks(long handle); + private static native void setMaxWriteBufferNumberToMaintain( + final long handle, final int maxWriteBufferNumberToMaintain); + private static native int maxWriteBufferNumberToMaintain(final long handle); + private static native void setCompactionPriority( + final long handle, final byte compactionPriority); + private static native byte compactionPriority(final long handle); + private static native void setReportBgIoStats(final long handle, final boolean reportBgIoStats); + private static native boolean reportBgIoStats(final long handle); + private static native void setTtl(final long handle, final long ttl); + private static native long ttl(final long handle); + private static native void setPeriodicCompactionSeconds( final long handle, final long periodicCompactionSeconds); - private native long periodicCompactionSeconds(final long handle); - private native void setCompactionOptionsUniversal(final long handle, - final long compactionOptionsUniversalHandle); - private native void setCompactionOptionsFIFO(final long handle, - final long compactionOptionsFIFOHandle); - private native void setForceConsistencyChecks(final long handle, - final boolean forceConsistencyChecks); - private native boolean forceConsistencyChecks(final long handle); - private native void setAtomicFlush(final long handle, - final boolean atomicFlush); - private native boolean atomicFlush(final long handle); - private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); - private native void setMemtableMaxRangeDeletions(final long handle, final int count); - private native int memtableMaxRangeDeletions(final long handle); + private static native long periodicCompactionSeconds(final long handle); + private static native void setCompactionOptionsUniversal( + final long handle, final long compactionOptionsUniversalHandle); + private static native void setCompactionOptionsFIFO( + final long handle, final long compactionOptionsFIFOHandle); + private static native void setForceConsistencyChecks( + final long handle, final boolean forceConsistencyChecks); + private static native boolean forceConsistencyChecks(final long handle); + private static native void setAtomicFlush(final long handle, final boolean atomicFlush); + private static native boolean atomicFlush(final long handle); + private static native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private static native void setMemtableMaxRangeDeletions(final long handle, final int count); + private static native int memtableMaxRangeDeletions(final long handle); private static native void setCompactionThreadLimiter( final long nativeHandle_, final long newLimiterHandle); private static native void setAvoidUnnecessaryBlockingIO( @@ -2537,32 +2491,38 @@ private static native void setMaxBgErrorResumeCount( private static native void setBgerrorResumeRetryInterval( final long handle, final long bgerrorResumeRetryInterval); private static native long bgerrorResumeRetryInterval(final long handle); - private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); - private native boolean enableBlobFiles(final long nativeHandle_); - private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); - private native long minBlobSize(final long nativeHandle_); - private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); - private native long blobFileSize(final long nativeHandle_); - private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType); - private native byte blobCompressionType(final long nativeHandle_); - private native void setEnableBlobGarbageCollection( + + private static native void setEnableBlobFiles( + final long nativeHandle_, final boolean enableBlobFiles); + private static native boolean enableBlobFiles(final long nativeHandle_); + private static native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); + private static native long minBlobSize(final long nativeHandle_); + private static native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); + private static native long blobFileSize(final long nativeHandle_); + private static native void setBlobCompressionType( + final long nativeHandle_, final byte compressionType); + private static native byte blobCompressionType(final long nativeHandle_); + private static native void setEnableBlobGarbageCollection( final long nativeHandle_, final boolean enableBlobGarbageCollection); - private native boolean enableBlobGarbageCollection(final long nativeHandle_); - private native void setBlobGarbageCollectionAgeCutoff( + private static native boolean enableBlobGarbageCollection(final long nativeHandle_); + private static native void setBlobGarbageCollectionAgeCutoff( final long nativeHandle_, final double blobGarbageCollectionAgeCutoff); - private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); - private native void setBlobGarbageCollectionForceThreshold( + private static native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); + private static native void setBlobGarbageCollectionForceThreshold( final long nativeHandle_, final double blobGarbageCollectionForceThreshold); - private native double blobGarbageCollectionForceThreshold(final long nativeHandle_); - private native void setBlobCompactionReadaheadSize( + private static native double blobGarbageCollectionForceThreshold(final long nativeHandle_); + private static native void setBlobCompactionReadaheadSize( final long nativeHandle_, final long blobCompactionReadaheadSize); - private native long blobCompactionReadaheadSize(final long nativeHandle_); - private native void setBlobFileStartingLevel( + private static native long blobCompactionReadaheadSize(final long nativeHandle_); + private static native void setBlobFileStartingLevel( final long nativeHandle_, final int blobFileStartingLevel); - private native int blobFileStartingLevel(final long nativeHandle_); - private native void setPrepopulateBlobCache( + private static native int blobFileStartingLevel(final long nativeHandle_); + private static native void setPrepopulateBlobCache( final long nativeHandle_, final byte prepopulateBlobCache); - private native byte prepopulateBlobCache(final long nativeHandle_); + private static native byte prepopulateBlobCache(final long nativeHandle_); + private static native long[] tablePropertiesCollectorFactory(long nativeHandle); + private static native void setTablePropertiesCollectorFactory( + long nativeHandle, long[] factoryHandlers); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! diff --git a/java/src/main/java/org/rocksdb/OptionsUtil.java b/java/src/main/java/org/rocksdb/OptionsUtil.java index 4168921f2a0..642599205d6 100644 --- a/java/src/main/java/org/rocksdb/OptionsUtil.java +++ b/java/src/main/java/org/rocksdb/OptionsUtil.java @@ -110,5 +110,5 @@ private static native void loadOptionsFromFile(long cfgHandle, String optionsFil private static native String getLatestOptionsFileName(String dbPath, long envHandle) throws RocksDBException; - private native static TableFormatConfig readTableFormatConfig(final long nativeHandle_); + private static native TableFormatConfig readTableFormatConfig(final long nativeHandle_); } diff --git a/java/src/main/java/org/rocksdb/PersistentCache.java b/java/src/main/java/org/rocksdb/PersistentCache.java index 5297111e6f9..900e7d1393b 100644 --- a/java/src/main/java/org/rocksdb/PersistentCache.java +++ b/java/src/main/java/org/rocksdb/PersistentCache.java @@ -22,5 +22,9 @@ private static native long newPersistentCache(final long envHandle, final String final long size, final long loggerHandle, final boolean optimizedForNvm) throws RocksDBException; - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/PlainTableConfig.java b/java/src/main/java/org/rocksdb/PlainTableConfig.java index 46077ba5653..1331f5b0a2a 100644 --- a/java/src/main/java/org/rocksdb/PlainTableConfig.java +++ b/java/src/main/java/org/rocksdb/PlainTableConfig.java @@ -234,10 +234,8 @@ public boolean storeIndexInFile() { storeIndexInFile_); } - private native long newTableFactoryHandle( - int keySize, int bloomBitsPerKey, - double hashTableRatio, int indexSparseness, - int hugePageTlbSize, byte encodingType, + private static native long newTableFactoryHandle(int keySize, int bloomBitsPerKey, + double hashTableRatio, int indexSparseness, int hugePageTlbSize, byte encodingType, boolean fullScanMode, boolean storeIndexInFile); private int keySize_; diff --git a/java/src/main/java/org/rocksdb/RateLimiter.java b/java/src/main/java/org/rocksdb/RateLimiter.java index c2b8a0fd92e..003bc61211d 100644 --- a/java/src/main/java/org/rocksdb/RateLimiter.java +++ b/java/src/main/java/org/rocksdb/RateLimiter.java @@ -215,13 +215,16 @@ public long getTotalRequests() { private static native long newRateLimiterHandle(final long rateBytesPerSecond, final long refillPeriodMicros, final int fairness, final byte rateLimiterMode, final boolean autoTune); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); - private native void setBytesPerSecond(final long handle, - final long bytesPerSecond); - private native long getBytesPerSecond(final long handle); - private native void request(final long handle, final long bytes); - private native long getSingleBurstBytes(final long handle); - private native long getTotalBytesThrough(final long handle); - private native long getTotalRequests(final long handle); + private static native void setBytesPerSecond(final long handle, final long bytesPerSecond); + private static native long getBytesPerSecond(final long handle); + private static native void request(final long handle, final long bytes); + private static native long getSingleBurstBytes(final long handle); + private static native long getTotalBytesThrough(final long handle); + private static native long getTotalRequests(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java index 481101fc932..5ce4a8656d3 100644 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -573,7 +573,6 @@ public ReadOptions setAutoPrefixMode(final boolean mode) { * @see #iterStartTs() * @return Reference to timestamp or null if there is no timestamp defined. */ - @SuppressWarnings("PMD.ConfusingTernary") public Slice timestamp() { assert (isOwningHandle()); final long timestampSliceHandle = timestamp(nativeHandle_); @@ -623,7 +622,6 @@ public ReadOptions setTimestamp(final AbstractSlice timestamp) { * @return Reference to lower bound timestamp or null if there is no lower bound timestamp * defined. */ - @SuppressWarnings("PMD.ConfusingTernary") public Slice iterStartTs() { assert (isOwningHandle()); final long iterStartTsHandle = iterStartTs(nativeHandle_); @@ -752,6 +750,35 @@ public ReadOptions setValueSizeSoftLimit(final long valueSizeSoftLimit) { return this; } + /** + * If async_io is enabled, RocksDB will prefetch some of data asynchronously. + * RocksDB apply it if reads are sequential and its internal automatic + * prefetching. + *

+ * Default: false + * @return true if async_io is enabled. + */ + @Experimental("Caution: this option is experimental") + public boolean asyncIo() { + assert (isOwningHandle()); + return asyncIo(nativeHandle_); + } + + /** + * If async_io is enabled, RocksDB will prefetch some of data asynchronously. + * RocksDB apply it if reads are sequential and its internal automatic + * prefetching. + *

+ * @param asyncIo async_io enabled or not. + * @return the reference to the current ReadOptions. + */ + @Experimental("Caution: this option is experimental") + public ReadOptions setAsyncIo(final boolean asyncIo) { + assert (isOwningHandle()); + setAsyncIo(nativeHandle_, asyncIo); + return this; + } + // instance variables // NOTE: If you add new member variables, please update the copy constructor above! // @@ -768,55 +795,60 @@ public ReadOptions setValueSizeSoftLimit(final long valueSizeSoftLimit) { private static native long newReadOptions(); private static native long newReadOptions(final boolean verifyChecksums, final boolean fillCache); private static native long copyReadOptions(long handle); - @Override protected final native void disposeInternal(final long handle); - - private native boolean verifyChecksums(long handle); - private native void setVerifyChecksums(long handle, boolean verifyChecksums); - private native boolean fillCache(long handle); - private native void setFillCache(long handle, boolean fillCache); - private native long snapshot(long handle); - private native void setSnapshot(long handle, long snapshotHandle); - private native byte readTier(long handle); - private native void setReadTier(long handle, byte readTierValue); - private native boolean tailing(long handle); - private native void setTailing(long handle, boolean tailing); - private native boolean managed(long handle); - private native void setManaged(long handle, boolean managed); - private native boolean totalOrderSeek(long handle); - private native void setTotalOrderSeek(long handle, boolean totalOrderSeek); - private native boolean prefixSameAsStart(long handle); - private native void setPrefixSameAsStart(long handle, boolean prefixSameAsStart); - private native boolean pinData(long handle); - private native void setPinData(long handle, boolean pinData); - private native boolean backgroundPurgeOnIteratorCleanup(final long handle); - private native void setBackgroundPurgeOnIteratorCleanup(final long handle, - final boolean backgroundPurgeOnIteratorCleanup); - private native long readaheadSize(final long handle); - private native void setReadaheadSize(final long handle, - final long readaheadSize); - private native long maxSkippableInternalKeys(final long handle); - private native void setMaxSkippableInternalKeys(final long handle, - final long maxSkippableInternalKeys); - private native boolean ignoreRangeDeletions(final long handle); - private native void setIgnoreRangeDeletions(final long handle, - final boolean ignoreRangeDeletions); - private native void setIterateUpperBound(final long handle, - final long upperBoundSliceHandle); - private native long iterateUpperBound(final long handle); - private native void setIterateLowerBound(final long handle, - final long lowerBoundSliceHandle); - private native long iterateLowerBound(final long handle); - private native void setTableFilter(final long handle, final long tableFilterHandle); - private native boolean autoPrefixMode(final long handle); - private native void setAutoPrefixMode(final long handle, final boolean autoPrefixMode); - private native long timestamp(final long handle); - private native void setTimestamp(final long handle, final long timestampSliceHandle); - private native long iterStartTs(final long handle); - private native void setIterStartTs(final long handle, final long iterStartTsHandle); - private native long deadline(final long handle); - private native void setDeadline(final long handle, final long deadlineTime); - private native long ioTimeout(final long handle); - private native void setIoTimeout(final long handle, final long ioTimeout); - private native long valueSizeSoftLimit(final long handle); - private native void setValueSizeSoftLimit(final long handle, final long softLimit); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + private native boolean asyncIo(final long handle); + private native void setAsyncIo(final long handle, final boolean asyncIO); + private static native boolean verifyChecksums(long handle); + private static native void setVerifyChecksums(long handle, boolean verifyChecksums); + private static native boolean fillCache(long handle); + private static native void setFillCache(long handle, boolean fillCache); + private static native long snapshot(long handle); + private static native void setSnapshot(long handle, long snapshotHandle); + private static native byte readTier(long handle); + private static native void setReadTier(long handle, byte readTierValue); + private static native boolean tailing(long handle); + private static native void setTailing(long handle, boolean tailing); + private static native boolean managed(long handle); + private static native void setManaged(long handle, boolean managed); + private static native boolean totalOrderSeek(long handle); + private static native void setTotalOrderSeek(long handle, boolean totalOrderSeek); + private static native boolean prefixSameAsStart(long handle); + private static native void setPrefixSameAsStart(long handle, boolean prefixSameAsStart); + private static native boolean pinData(long handle); + private static native void setPinData(long handle, boolean pinData); + private static native boolean backgroundPurgeOnIteratorCleanup(final long handle); + private static native void setBackgroundPurgeOnIteratorCleanup( + final long handle, final boolean backgroundPurgeOnIteratorCleanup); + private static native long readaheadSize(final long handle); + private static native void setReadaheadSize(final long handle, final long readaheadSize); + private static native long maxSkippableInternalKeys(final long handle); + private static native void setMaxSkippableInternalKeys( + final long handle, final long maxSkippableInternalKeys); + private static native boolean ignoreRangeDeletions(final long handle); + private static native void setIgnoreRangeDeletions( + final long handle, final boolean ignoreRangeDeletions); + private static native void setIterateUpperBound( + final long handle, final long upperBoundSliceHandle); + private static native long iterateUpperBound(final long handle); + private static native void setIterateLowerBound( + final long handle, final long lowerBoundSliceHandle); + private static native long iterateLowerBound(final long handle); + private static native void setTableFilter(final long handle, final long tableFilterHandle); + private static native boolean autoPrefixMode(final long handle); + private static native void setAutoPrefixMode(final long handle, final boolean autoPrefixMode); + private static native long timestamp(final long handle); + private static native void setTimestamp(final long handle, final long timestampSliceHandle); + private static native long iterStartTs(final long handle); + private static native void setIterStartTs(final long handle, final long iterStartTsHandle); + private static native long deadline(final long handle); + private static native void setDeadline(final long handle, final long deadlineTime); + private static native long ioTimeout(final long handle); + private static native void setIoTimeout(final long handle, final long ioTimeout); + private static native long valueSizeSoftLimit(final long handle); + private static native void setValueSizeSoftLimit(final long handle, final long softLimit); } diff --git a/java/src/main/java/org/rocksdb/RestoreOptions.java b/java/src/main/java/org/rocksdb/RestoreOptions.java index a6b43d47606..2ea0e372990 100644 --- a/java/src/main/java/org/rocksdb/RestoreOptions.java +++ b/java/src/main/java/org/rocksdb/RestoreOptions.java @@ -28,5 +28,9 @@ public RestoreOptions(final boolean keepLogFiles) { } private static native long newRestoreOptions(boolean keepLogFiles); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/RocksCallbackObject.java b/java/src/main/java/org/rocksdb/RocksCallbackObject.java index 2c4547b1291..8a7c3713e9b 100644 --- a/java/src/main/java/org/rocksdb/RocksCallbackObject.java +++ b/java/src/main/java/org/rocksdb/RocksCallbackObject.java @@ -69,5 +69,5 @@ protected void disposeInternal() { disposeInternal(nativeHandle_); } - private native void disposeInternal(final long handle); + private static native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 54e95e6e8a1..2467d249b05 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -6,6 +6,7 @@ package org.rocksdb; import static java.nio.charset.StandardCharsets.UTF_8; +import static org.rocksdb.util.BufferUtil.CheckBounds; import java.io.IOException; import java.nio.ByteBuffer; @@ -32,14 +33,15 @@ private enum LibraryState { private static final AtomicReference libraryLoaded = new AtomicReference<>(LibraryState.NOT_LOADED); - static { - RocksDB.loadLibrary(); - } - static final String PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD = "Performance optimization for a very specific workload"; - private final List ownedColumnFamilyHandles = new ArrayList<>(); + private static final String BB_ALL_DIRECT_OR_INDIRECT = + "ByteBuffer parameters must all be direct, or must all be indirect"; + private ColumnFamilyHandle defaultColumnFamilyHandle_; + private final ReadOptions defaultReadOptions_ = new ReadOptions(); + + final List ownedColumnFamilyHandles = new ArrayList<>(); /** * Loads the necessary library files. @@ -157,6 +159,10 @@ public static Version rocksdbVersion() { return version; } + public boolean isClosed() { + return !owningHandle_.get(); + } + /** * Private constructor. * @@ -256,6 +262,7 @@ public static RocksDB open(final Options options, final String path) // the currently-created RocksDB. final RocksDB db = new RocksDB(open(options.nativeHandle_, path)); db.storeOptionsInstance(options); + db.storeDefaultColumnFamilyHandle(db.makeDefaultColumnFamilyHandle()); return db; } @@ -297,16 +304,22 @@ public static RocksDB open(final Options options, final String path) */ public static RocksDB open(final DBOptions options, final String path, final List columnFamilyDescriptors, - final List columnFamilyHandles) - throws RocksDBException { - + final List columnFamilyHandles) throws RocksDBException { final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + int defaultColumnFamilyIndex = -1; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors .get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + if (Arrays.equals(cfDescriptor.getName(), RocksDB.DEFAULT_COLUMN_FAMILY)) { + defaultColumnFamilyIndex = i; + } + } + if (defaultColumnFamilyIndex < 0) { + throw new IllegalArgumentException( + "You must provide the default column family in your columnFamilyDescriptors"); } final long[] handles = open(options.nativeHandle_, path, cfNames, @@ -321,7 +334,7 @@ public static RocksDB open(final DBOptions options, final String path, } db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); - + db.storeDefaultColumnFamilyHandle(columnFamilyHandles.get(defaultColumnFamilyIndex)); return db; } @@ -395,6 +408,7 @@ public static RocksDB openReadOnly(final Options options, final String path, // the currently-created RocksDB. final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path, errorIfWalFileExists)); db.storeOptionsInstance(options); + db.storeDefaultColumnFamilyHandle(db.makeDefaultColumnFamilyHandle()); return db; } @@ -486,11 +500,19 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + int defaultColumnFamilyIndex = -1; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors .get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + if (Arrays.equals(cfDescriptor.getName(), RocksDB.DEFAULT_COLUMN_FAMILY)) { + defaultColumnFamilyIndex = i; + } + } + if (defaultColumnFamilyIndex < 0) { + throw new IllegalArgumentException( + "You must provide the default column family in your columnFamilyDescriptors"); } final long[] handles = @@ -505,6 +527,7 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, } db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + db.storeDefaultColumnFamilyHandle(columnFamilyHandles.get(defaultColumnFamilyIndex)); return db; } @@ -542,6 +565,7 @@ public static RocksDB openAsSecondary(final Options options, final String path, // the currently-created RocksDB. final RocksDB db = new RocksDB(openAsSecondary(options.nativeHandle_, path, secondaryPath)); db.storeOptionsInstance(options); + db.storeDefaultColumnFamilyHandle(db.makeDefaultColumnFamilyHandle()); return db; } @@ -602,6 +626,7 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path } db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + db.storeDefaultColumnFamilyHandle(db.makeDefaultColumnFamilyHandle()); return db; } @@ -887,8 +912,8 @@ public void put(final byte[] key, final byte[] value) public void put(final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); put(nativeHandle_, key, offset, len, value, vOffset, vLen); } @@ -937,8 +962,8 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); put(nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } @@ -982,8 +1007,8 @@ public void put(final WriteOptions writeOpts, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen); } @@ -1031,9 +1056,18 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, */ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { - assert key.isDirect() && value.isDirect(); - putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value, - value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); + if (key.isDirect() && value.isDirect()) { + putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value, + value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + put(nativeHandle_, writeOpts.nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } key.position(key.limit()); value.position(value.limit()); } @@ -1055,9 +1089,18 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions */ public void put(final WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { - assert key.isDirect() && value.isDirect(); - putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value, - value.position(), value.remaining(), 0); + if (key.isDirect() && value.isDirect()) { + putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value, + value.position(), value.remaining(), 0); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + put(nativeHandle_, writeOpts.nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), + value.remaining()); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } key.position(key.limit()); value.position(value.limit()); } @@ -1089,8 +1132,8 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } @@ -1268,9 +1311,18 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, */ public int get(final ReadOptions opt, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { - assert key.isDirect() && value.isDirect(); - final int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), - key.remaining(), value, value.position(), value.remaining(), 0); + final int result; + if (key.isDirect() && value.isDirect()) { + result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), + value, value.position(), value.remaining(), 0); + } else if (!key.isDirect() && !value.isDirect()) { + result = + get(nativeHandle_, opt.nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), + value.remaining(), defaultColumnFamilyHandle_.nativeHandle_); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } if (result != NOT_FOUND) { value.limit(Math.min(value.limit(), value.position() + result)); } @@ -1556,8 +1608,8 @@ public void merge(final byte[] key, final byte[] value) */ public void merge(final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); merge(nativeHandle_, key, offset, len, value, vOffset, vLen); } @@ -1601,8 +1653,8 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); merge(nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } @@ -1648,12 +1700,48 @@ public void merge(final WriteOptions writeOpts, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); merge(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen); } + public void merge(final WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value) + throws RocksDBException { + if (key.isDirect() && value.isDirect()) { + mergeDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), + value, value.position(), value.remaining(), 0); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + merge(nativeHandle_, writeOpts.nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), + value.remaining()); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + value.position(value.limit()); + } + + public void merge(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, + final ByteBuffer key, final ByteBuffer value) throws RocksDBException { + if (key.isDirect() && value.isDirect()) { + mergeDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), + value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + merge(nativeHandle_, writeOpts.nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + value.position(value.limit()); + } + /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" @@ -1740,8 +1828,8 @@ public void merge( final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); merge(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); @@ -1825,8 +1913,8 @@ public int get(final byte[] key, final byte[] value) throws RocksDBException { public int get(final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); return get(nativeHandle_, key, offset, len, value, vOffset, vLen); } @@ -1882,8 +1970,8 @@ public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException, IllegalArgumentException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); return get(nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } @@ -1937,8 +2025,8 @@ public int get(final ReadOptions opt, final byte[] key, public int get(final ReadOptions opt, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value, vOffset, vLen); } @@ -1998,8 +2086,8 @@ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - checkBounds(offset, len, key.length); - checkBounds(vOffset, vLen, value.length); + CheckBounds(offset, len, key.length); + CheckBounds(vOffset, vLen, value.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } @@ -2038,7 +2126,7 @@ public byte[] get(final byte[] key) throws RocksDBException { */ public byte[] get(final byte[] key, final int offset, final int len) throws RocksDBException { - checkBounds(offset, len, key.length); + CheckBounds(offset, len, key.length); return get(nativeHandle_, key, offset, len); } @@ -2083,7 +2171,7 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len) throws RocksDBException { - checkBounds(offset, len, key.length); + CheckBounds(offset, len, key.length); return get(nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); } @@ -2125,7 +2213,7 @@ public byte[] get(final ReadOptions opt, final byte[] key) */ public byte[] get(final ReadOptions opt, final byte[] key, final int offset, final int len) throws RocksDBException { - checkBounds(offset, len, key.length); + CheckBounds(offset, len, key.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len); } @@ -2172,7 +2260,7 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key, final int offset, final int len) throws RocksDBException { - checkBounds(offset, len, key.length); + CheckBounds(offset, len, key.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); } @@ -2227,7 +2315,7 @@ public List multiGetAsList( final List keys) throws RocksDBException, IllegalArgumentException { assert (!keys.isEmpty()); - // Check if key size equals cfList size. If not a exception must be + // Check if key size equals cfList size. If not an exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandleList.size()) { throw new IllegalArgumentException( @@ -2417,7 +2505,8 @@ public List multiGetByteBuffers(final ReadOptions readOptio // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (values.size() != keys.size()) { - throw new IllegalArgumentException("For each key there must be a corresponding value."); + throw new IllegalArgumentException("For each key there must be a corresponding value. " + + keys.size() + " keys were supplied, but " + values.size() + " values were supplied."); } // TODO (AP) support indirect buffers @@ -2467,6 +2556,12 @@ public List multiGetByteBuffers(final ReadOptions readOptio value.position(Math.min(valuesSizeArray[i], value.capacity())); value.flip(); // prepare for read out results.add(new ByteBufferGetStatus(status, valuesSizeArray[i], value)); + } else if (status.getCode() == Status.Code.Incomplete) { + assert valuesSizeArray[i] == -1; + final ByteBuffer value = valuesArray[i]; + value.position(value.capacity()); + value.flip(); // prepare for read out + results.add(new ByteBufferGetStatus(status, value.capacity(), value)); } else { results.add(new ByteBufferGetStatus(status)); } @@ -2973,7 +3068,7 @@ public boolean keyMayExist( final ReadOptions readOptions, final byte[] key, final int offset, final int len, /* @Nullable */ final Holder valueHolder) { - checkBounds(offset, len, key.length); + CheckBounds(offset, len, key.length); if (valueHolder == null) { return keyMayExist(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, @@ -3113,9 +3208,11 @@ public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions, final ByteBuffer key) { assert key != null : "key ByteBuffer parameter cannot be null"; assert key.isDirect() : "key parameter must be a direct ByteBuffer"; - return keyMayExistDirect(nativeHandle_, + final boolean result = keyMayExistDirect(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.limit()); + key.position(key.limit()); + return result; } /** @@ -3148,6 +3245,7 @@ public KeyMayExist keyMayExist(final ColumnFamilyHandle columnFamilyHandle, value, value.position(), value.remaining()); final int valueLength = result[1]; value.limit(value.position() + Math.min(valueLength, value.remaining())); + key.position(key.limit()); return new KeyMayExist(KeyMayExist.KeyMayExistEnum.values()[result[0]], valueLength); } @@ -3164,7 +3262,9 @@ public KeyMayExist keyMayExist(final ColumnFamilyHandle columnFamilyHandle, * @return instance of iterator object. */ public RocksIterator newIterator() { - return new RocksIterator(this, iterator(nativeHandle_)); + return new RocksIterator(this, + iterator(nativeHandle_, defaultColumnFamilyHandle_.nativeHandle_, + defaultReadOptions_.nativeHandle_)); } /** @@ -3181,8 +3281,9 @@ public RocksIterator newIterator() { * @return instance of iterator object. */ public RocksIterator newIterator(final ReadOptions readOptions) { - return new RocksIterator(this, iterator(nativeHandle_, - readOptions.nativeHandle_)); + return new RocksIterator(this, + iterator( + nativeHandle_, defaultColumnFamilyHandle_.nativeHandle_, readOptions.nativeHandle_)); } /** @@ -3201,8 +3302,9 @@ public RocksIterator newIterator(final ReadOptions readOptions) { */ public RocksIterator newIterator( final ColumnFamilyHandle columnFamilyHandle) { - return new RocksIterator(this, iteratorCF(nativeHandle_, - columnFamilyHandle.nativeHandle_)); + return new RocksIterator(this, + iterator( + nativeHandle_, columnFamilyHandle.nativeHandle_, defaultReadOptions_.nativeHandle_)); } /** @@ -3222,8 +3324,8 @@ public RocksIterator newIterator( */ public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions) { - return new RocksIterator(this, iteratorCF(nativeHandle_, - columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_)); + return new RocksIterator( + this, iterator(nativeHandle_, columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_)); } /** @@ -4199,28 +4301,18 @@ public void disableFileDeletions() throws RocksDBException { } /** - *

Enable deleting obsolete files. - * If force == true, the call to EnableFileDeletions() - * will guarantee that file deletions are enabled after - * the call, even if DisableFileDeletions() was called - * multiple times before.

- * - *

If force == false, EnableFileDeletions will only - * enable file deletion after it's been called at least - * as many times as DisableFileDeletions(), enabling - * the two methods to be called by two threads - * concurrently without synchronization + *

EnableFileDeletions will only enable file deletion after + * it's been called at least as many times as DisableFileDeletions(), + * enabling the two methods to be called by two threads concurrently + * without synchronization * -- i.e., file deletions will be enabled only after both * threads call EnableFileDeletions()

* - * @param force boolean value described above. - * * @throws RocksDBException thrown if operation was not performed * successfully. */ - public void enableFileDeletions(final boolean force) - throws RocksDBException { - enableFileDeletions(nativeHandle_, force); + public void enableFileDeletions() throws RocksDBException { + enableFileDeletions(nativeHandle_); } public static class LiveFiles { @@ -4438,8 +4530,17 @@ public void verifyChecksum() throws RocksDBException { * @return The handle of the default column family */ public ColumnFamilyHandle getDefaultColumnFamily() { - final ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this, - getDefaultColumnFamily(nativeHandle_)); + return defaultColumnFamilyHandle_; + } + + /** + * Create a handle for the default CF on open + * + * @return the default family handle + */ + protected ColumnFamilyHandle makeDefaultColumnFamilyHandle() { + final ColumnFamilyHandle cfHandle = + new ColumnFamilyHandle(this, getDefaultColumnFamily(nativeHandle_)); cfHandle.disOwnNativeHandle(); return cfHandle; } @@ -4688,7 +4789,11 @@ protected void storeOptionsInstance(final DBOptionsInterface options) { options_ = options; } - private static void checkBounds(final int offset, final int len, final int size) { + protected void storeDefaultColumnFamilyHandle(ColumnFamilyHandle columnFamilyHandle) { + defaultColumnFamilyHandle_ = columnFamilyHandle; + } + + private static void checkBounds(int offset, int len, int size) { if ((offset | len | (offset + len) | (size - (offset + len))) < 0) { throw new IndexOutOfBoundsException(String.format("offset(%d), len(%d), size(%d)", offset, len, size)); } @@ -4739,281 +4844,239 @@ private static native long[] openAsSecondary(final long optionsHandle, final Str final String secondaryPath, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; - @Override protected native void disposeInternal(final long handle); + @Override + protected void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native void closeDatabase(final long handle) throws RocksDBException; private static native byte[][] listColumnFamilies(final long optionsHandle, final String path) throws RocksDBException; - private native long createColumnFamily(final long handle, - final byte[] columnFamilyName, final int columnFamilyNamelen, - final long columnFamilyOptions) throws RocksDBException; - private native long[] createColumnFamilies(final long handle, - final long columnFamilyOptionsHandle, final byte[][] columnFamilyNames) + private static native long createColumnFamily(final long handle, final byte[] columnFamilyName, + final int columnFamilyNamelen, final long columnFamilyOptions) throws RocksDBException; + private static native long[] createColumnFamilies( + final long handle, final long columnFamilyOptionsHandle, final byte[][] columnFamilyNames) throws RocksDBException; - private native long[] createColumnFamilies( + private static native long[] createColumnFamilies( final long handle, final long[] columnFamilyOptionsHandles, final byte[][] columnFamilyNames) throws RocksDBException; - private native long createColumnFamilyWithImport(final long handle, final byte[] columnFamilyName, - final int columnFamilyNamelen, final long columnFamilyOptions, + private static native long createColumnFamilyWithImport(final long handle, + final byte[] columnFamilyName, final int columnFamilyNamelen, final long columnFamilyOptions, final long importColumnFamilyOptions, final long[] metadataHandleList) throws RocksDBException; - private native void dropColumnFamily( - final long handle, final long cfHandle) throws RocksDBException; - private native void dropColumnFamilies(final long handle, - final long[] cfHandles) throws RocksDBException; - private native void put(final long handle, final byte[] key, - final int keyOffset, final int keyLength, final byte[] value, - final int valueOffset, int valueLength) throws RocksDBException; - private native void put(final long handle, final byte[] key, final int keyOffset, - final int keyLength, final byte[] value, final int valueOffset, - final int valueLength, final long cfHandle) throws RocksDBException; - private native void put(final long handle, final long writeOptHandle, - final byte[] key, final int keyOffset, final int keyLength, - final byte[] value, final int valueOffset, final int valueLength) + private static native void dropColumnFamily(final long handle, final long cfHandle) throws RocksDBException; - private native void put(final long handle, final long writeOptHandle, - final byte[] key, final int keyOffset, final int keyLength, - final byte[] value, final int valueOffset, final int valueLength, - final long cfHandle) throws RocksDBException; - private native void delete(final long handle, final byte[] key, - final int keyOffset, final int keyLength) throws RocksDBException; - private native void delete(final long handle, final byte[] key, - final int keyOffset, final int keyLength, final long cfHandle) + private static native void dropColumnFamilies(final long handle, final long[] cfHandles) throws RocksDBException; - private native void delete(final long handle, final long writeOptHandle, - final byte[] key, final int keyOffset, final int keyLength) + private static native void put(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, int valueLength) throws RocksDBException; - private native void delete(final long handle, final long writeOptHandle, - final byte[] key, final int keyOffset, final int keyLength, + private static native void put(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; - private native void singleDelete( - final long handle, final byte[] key, final int keyLen) + private static native void put(final long handle, final long writeOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, + final int valueLength) throws RocksDBException; + private static native void put(final long handle, final long writeOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, + final int valueLength, final long cfHandle) throws RocksDBException; + private static native void delete(final long handle, final byte[] key, final int keyOffset, + final int keyLength) throws RocksDBException; + private static native void delete(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final long cfHandle) throws RocksDBException; + private static native void delete(final long handle, final long writeOptHandle, final byte[] key, + final int keyOffset, final int keyLength) throws RocksDBException; + private static native void delete(final long handle, final long writeOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; + private static native void singleDelete(final long handle, final byte[] key, final int keyLen) throws RocksDBException; - private native void singleDelete( - final long handle, final byte[] key, final int keyLen, + private static native void singleDelete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; - private native void singleDelete( - final long handle, final long writeOptHandle, final byte[] key, - final int keyLen) throws RocksDBException; - private native void singleDelete( - final long handle, final long writeOptHandle, - final byte[] key, final int keyLen, final long cfHandle) - throws RocksDBException; - private native void deleteRange(final long handle, final byte[] beginKey, + private static native void singleDelete(final long handle, final long writeOptHandle, + final byte[] key, final int keyLen) throws RocksDBException; + private static native void singleDelete(final long handle, final long writeOptHandle, + final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; + private static native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength) throws RocksDBException; - private native void deleteRange(final long handle, final byte[] beginKey, + private static native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, - final int endKeyOffset, final int endKeyLength, final long cfHandle) - throws RocksDBException; - private native void deleteRange(final long handle, final long writeOptHandle, + final int endKeyOffset, final int endKeyLength, final long cfHandle) throws RocksDBException; + private static native void deleteRange(final long handle, final long writeOptHandle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, - final byte[] endKey, final int endKeyOffset, final int endKeyLength) - throws RocksDBException; - private native void deleteRange( - final long handle, final long writeOptHandle, final byte[] beginKey, - final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, - final int endKeyOffset, final int endKeyLength, final long cfHandle) + final byte[] endKey, final int endKeyOffset, final int endKeyLength) throws RocksDBException; + private static native void deleteRange(final long handle, final long writeOptHandle, + final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, + final byte[] endKey, final int endKeyOffset, final int endKeyLength, final long cfHandle) throws RocksDBException; - private native void clipColumnFamily(final long handle, final long cfHandle, + private static native void clipColumnFamily(final long handle, final long cfHandle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength) throws RocksDBException; - private native void merge(final long handle, final byte[] key, - final int keyOffset, final int keyLength, final byte[] value, - final int valueOffset, final int valueLength) throws RocksDBException; - private native void merge(final long handle, final byte[] key, - final int keyOffset, final int keyLength, final byte[] value, - final int valueOffset, final int valueLength, final long cfHandle) + private static native void merge(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; - private native void merge(final long handle, final long writeOptHandle, - final byte[] key, final int keyOffset, final int keyLength, - final byte[] value, final int valueOffset, final int valueLength) - throws RocksDBException; - private native void merge(final long handle, final long writeOptHandle, - final byte[] key, final int keyOffset, final int keyLength, - final byte[] value, final int valueOffset, final int valueLength, + private static native void merge(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; - private native void write0(final long handle, final long writeOptHandle, - final long wbHandle) throws RocksDBException; - private native void write1(final long handle, final long writeOptHandle, - final long wbwiHandle) throws RocksDBException; - private native int get(final long handle, final byte[] key, - final int keyOffset, final int keyLength, final byte[] value, - final int valueOffset, final int valueLength) throws RocksDBException; - private native int get(final long handle, final byte[] key, - final int keyOffset, final int keyLength, byte[] value, - final int valueOffset, final int valueLength, final long cfHandle) - throws RocksDBException; - private native int get(final long handle, final long readOptHandle, - final byte[] key, final int keyOffset, final int keyLength, - final byte[] value, final int valueOffset, final int valueLength) + private static native void merge(final long handle, final long writeOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, + final int valueLength) throws RocksDBException; + private static native void merge(final long handle, final long writeOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, + final int valueLength, final long cfHandle) throws RocksDBException; + private static native void mergeDirect(long handle, long writeOptHandle, ByteBuffer key, + int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, + long cfHandle) throws RocksDBException; + + private static native void write0( + final long handle, final long writeOptHandle, final long wbHandle) throws RocksDBException; + private static native void write1( + final long handle, final long writeOptHandle, final long wbwiHandle) throws RocksDBException; + private static native int get(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; - private native int get(final long handle, final long readOptHandle, - final byte[] key, final int keyOffset, final int keyLength, - final byte[] value, final int valueOffset, final int valueLength, + private static native int get(final long handle, final byte[] key, final int keyOffset, + final int keyLength, byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; - private native byte[] get(final long handle, byte[] key, final int keyOffset, + private static native int get(final long handle, final long readOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, + final int valueLength) throws RocksDBException; + private static native int get(final long handle, final long readOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, + final int valueLength, final long cfHandle) throws RocksDBException; + private static native byte[] get(final long handle, byte[] key, final int keyOffset, final int keyLength) throws RocksDBException; - private native byte[] get(final long handle, final byte[] key, - final int keyOffset, final int keyLength, final long cfHandle) - throws RocksDBException; - private native byte[] get(final long handle, final long readOptHandle, - final byte[] key, final int keyOffset, final int keyLength) - throws RocksDBException; - private native byte[] get(final long handle, - final long readOptHandle, final byte[] key, final int keyOffset, + private static native byte[] get(final long handle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; - private native byte[][] multiGet(final long dbHandle, final byte[][] keys, - final int[] keyOffsets, final int[] keyLengths); - private native byte[][] multiGet(final long dbHandle, final byte[][] keys, - final int[] keyOffsets, final int[] keyLengths, - final long[] columnFamilyHandles); - private native byte[][] multiGet(final long dbHandle, final long rOptHandle, + private static native byte[] get(final long handle, final long readOptHandle, final byte[] key, + final int keyOffset, final int keyLength) throws RocksDBException; + private static native byte[] get(final long handle, final long readOptHandle, final byte[] key, + final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; + private static native byte[][] multiGet( + final long dbHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); + private static native byte[][] multiGet(final long dbHandle, final byte[][] keys, + final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); + private static native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); - private native byte[][] multiGet(final long dbHandle, final long rOptHandle, + private static native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); - private native void multiGet(final long dbHandle, final long rOptHandle, + private static native void multiGet(final long dbHandle, final long rOptHandle, final long[] columnFamilyHandles, final ByteBuffer[] keysArray, final int[] keyOffsets, final int[] keyLengths, final ByteBuffer[] valuesArray, final int[] valuesSizeArray, final Status[] statusArray); - private native boolean keyExists(final long handle, final long cfHandle, final long readOptHandle, - final byte[] key, final int keyOffset, final int keyLength); + private static native boolean keyExists(final long handle, final long cfHandle, + final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); - private native boolean keyExistsDirect(final long handle, final long cfHandle, + private static native boolean keyExistsDirect(final long handle, final long cfHandle, final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength); - private native boolean keyMayExist( - final long handle, final long cfHandle, final long readOptHandle, - final byte[] key, final int keyOffset, final int keyLength); - private native byte[][] keyMayExistFoundValue( - final long handle, final long cfHandle, final long readOptHandle, - final byte[] key, final int keyOffset, final int keyLength); - private native void putDirect(long handle, long writeOptHandle, ByteBuffer key, int keyOffset, - int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle) - throws RocksDBException; - private native long iterator(final long handle); - private native long iterator(final long handle, final long readOptHandle); - private native long iteratorCF(final long handle, final long cfHandle); - private native long iteratorCF(final long handle, final long cfHandle, - final long readOptHandle); - private native long[] iterators(final long handle, - final long[] columnFamilyHandles, final long readOptHandle) - throws RocksDBException; - private native long getSnapshot(final long nativeHandle); - private native void releaseSnapshot( - final long nativeHandle, final long snapshotHandle); - private native String getProperty(final long nativeHandle, - final long cfHandle, final String property, final int propertyLength) - throws RocksDBException; - private native Map getMapProperty(final long nativeHandle, - final long cfHandle, final String property, final int propertyLength) - throws RocksDBException; - private native int getDirect(long handle, long readOptHandle, ByteBuffer key, int keyOffset, - int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle) - throws RocksDBException; - private native boolean keyMayExistDirect(final long handle, final long cfHhandle, + private static native boolean keyMayExist(final long handle, final long cfHandle, + final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); + private static native byte[][] keyMayExistFoundValue(final long handle, final long cfHandle, + final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); + private static native void putDirect(long handle, long writeOptHandle, ByteBuffer key, + int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, + long cfHandle) throws RocksDBException; + private static native long iterator( + final long handle, final long cfHandle, final long readOptHandle); + private static native long[] iterators(final long handle, final long[] columnFamilyHandles, + final long readOptHandle) throws RocksDBException; + + private static native long getSnapshot(final long nativeHandle); + private static native void releaseSnapshot(final long nativeHandle, final long snapshotHandle); + private static native String getProperty(final long nativeHandle, final long cfHandle, + final String property, final int propertyLength) throws RocksDBException; + private static native Map getMapProperty(final long nativeHandle, + final long cfHandle, final String property, final int propertyLength) throws RocksDBException; + private static native int getDirect(long handle, long readOptHandle, ByteBuffer key, + int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, + long cfHandle) throws RocksDBException; + private static native boolean keyMayExistDirect(final long handle, final long cfHhandle, final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength); - private native int[] keyMayExistDirectFoundValue(final long handle, final long cfHhandle, + private static native int[] keyMayExistDirectFoundValue(final long handle, final long cfHhandle, final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength, final ByteBuffer value, final int valueOffset, final int valueLength); - private native void deleteDirect(long handle, long optHandle, ByteBuffer key, int keyOffset, - int keyLength, long cfHandle) throws RocksDBException; - private native long getLongProperty(final long nativeHandle, - final long cfHandle, final String property, final int propertyLength) - throws RocksDBException; - private native void resetStats(final long nativeHandle) - throws RocksDBException; - private native long getAggregatedLongProperty(final long nativeHandle, - final String property, int propertyLength) throws RocksDBException; - private native long[] getApproximateSizes(final long nativeHandle, - final long columnFamilyHandle, final long[] rangeSliceHandles, - final byte includeFlags); - private native long[] getApproximateMemTableStats(final long nativeHandle, + private static native void deleteDirect(long handle, long optHandle, ByteBuffer key, + int keyOffset, int keyLength, long cfHandle) throws RocksDBException; + private static native long getLongProperty(final long nativeHandle, final long cfHandle, + final String property, final int propertyLength) throws RocksDBException; + private static native void resetStats(final long nativeHandle) throws RocksDBException; + private static native long getAggregatedLongProperty( + final long nativeHandle, final String property, int propertyLength) throws RocksDBException; + private static native long[] getApproximateSizes(final long nativeHandle, + final long columnFamilyHandle, final long[] rangeSliceHandles, final byte includeFlags); + private static native long[] getApproximateMemTableStats(final long nativeHandle, final long columnFamilyHandle, final long rangeStartSliceHandle, final long rangeLimitSliceHandle); - private native void compactRange(final long handle, + private static native void compactRange(final long handle, /* @Nullable */ final byte[] begin, final int beginLen, - /* @Nullable */ final byte[] end, final int endLen, - final long compactRangeOptHandle, final long cfHandle) - throws RocksDBException; - private native void setOptions(final long handle, final long cfHandle, - final String[] keys, final String[] values) throws RocksDBException; - private native String getOptions(final long handle, final long cfHandle); - private native void setDBOptions(final long handle, - final String[] keys, final String[] values) throws RocksDBException; - private native String getDBOptions(final long handle); - private native void setPerfLevel(final byte level); - private native byte getPerfLevelNative(); - - private native long getPerfContextNative(); - - private native String[] compactFiles(final long handle, - final long compactionOptionsHandle, - final long columnFamilyHandle, - final String[] inputFileNames, - final int outputLevel, - final int outputPathId, - final long compactionJobInfoHandle) throws RocksDBException; - private native void cancelAllBackgroundWork(final long handle, - final boolean wait); - private native void pauseBackgroundWork(final long handle) - throws RocksDBException; - private native void continueBackgroundWork(final long handle) - throws RocksDBException; - private native void enableAutoCompaction(final long handle, - final long[] columnFamilyHandles) throws RocksDBException; - private native int numberLevels(final long handle, - final long columnFamilyHandle); - private native int maxMemCompactionLevel(final long handle, - final long columnFamilyHandle); - private native int level0StopWriteTrigger(final long handle, - final long columnFamilyHandle); - private native String getName(final long handle); - private native long getEnv(final long handle); - private native void flush(final long handle, final long flushOptHandle, + /* @Nullable */ final byte[] end, final int endLen, final long compactRangeOptHandle, + final long cfHandle) throws RocksDBException; + private static native void setOptions(final long handle, final long cfHandle, final String[] keys, + final String[] values) throws RocksDBException; + private static native String getOptions(final long handle, final long cfHandle); + private static native void setDBOptions( + final long handle, final String[] keys, final String[] values) throws RocksDBException; + private static native String getDBOptions(final long handle); + private static native void setPerfLevel(final byte level); + private static native byte getPerfLevelNative(); + + private static native long getPerfContextNative(); + + private static native String[] compactFiles(final long handle, final long compactionOptionsHandle, + final long columnFamilyHandle, final String[] inputFileNames, final int outputLevel, + final int outputPathId, final long compactionJobInfoHandle) throws RocksDBException; + private static native void cancelAllBackgroundWork(final long handle, final boolean wait); + private static native void pauseBackgroundWork(final long handle) throws RocksDBException; + private static native void continueBackgroundWork(final long handle) throws RocksDBException; + private static native void enableAutoCompaction( + final long handle, final long[] columnFamilyHandles) throws RocksDBException; + private static native int numberLevels(final long handle, final long columnFamilyHandle); + private static native int maxMemCompactionLevel(final long handle, final long columnFamilyHandle); + private static native int level0StopWriteTrigger( + final long handle, final long columnFamilyHandle); + private static native String getName(final long handle); + private static native long getEnv(final long handle); + private static native void flush(final long handle, final long flushOptHandle, /* @Nullable */ final long[] cfHandles) throws RocksDBException; - private native void flushWal(final long handle, final boolean sync) + private static native void flushWal(final long handle, final boolean sync) throws RocksDBException; - private native void syncWal(final long handle) throws RocksDBException; - private native long getLatestSequenceNumber(final long handle); - private native void disableFileDeletions(long handle) throws RocksDBException; - private native void enableFileDeletions(long handle, boolean force) + private static native void syncWal(final long handle) throws RocksDBException; + private static native long getLatestSequenceNumber(final long handle); + private static native void disableFileDeletions(long handle) throws RocksDBException; + private static native void enableFileDeletions(long handle) throws RocksDBException; + private static native String[] getLiveFiles(final long handle, final boolean flushMemtable) throws RocksDBException; - private native String[] getLiveFiles(final long handle, - final boolean flushMemtable) throws RocksDBException; - private native LogFile[] getSortedWalFiles(final long handle) + private static native LogFile[] getSortedWalFiles(final long handle) throws RocksDBException; + private static native long getUpdatesSince(final long handle, final long sequenceNumber) throws RocksDBException; - private native long getUpdatesSince(final long handle, - final long sequenceNumber) throws RocksDBException; - private native void deleteFile(final long handle, final String name) + private static native void deleteFile(final long handle, final String name) throws RocksDBException; - private native LiveFileMetaData[] getLiveFilesMetaData(final long handle); - private native ColumnFamilyMetaData getColumnFamilyMetaData( + private static native LiveFileMetaData[] getLiveFilesMetaData(final long handle); + private static native ColumnFamilyMetaData getColumnFamilyMetaData( final long handle, final long columnFamilyHandle); - private native void ingestExternalFile(final long handle, - final long columnFamilyHandle, final String[] filePathList, - final int filePathListLen, final long ingestExternalFileOptionsHandle) - throws RocksDBException; - private native void verifyChecksum(final long handle) throws RocksDBException; - private native long getDefaultColumnFamily(final long handle); - private native Map getPropertiesOfAllTables( + private static native void ingestExternalFile(final long handle, final long columnFamilyHandle, + final String[] filePathList, final int filePathListLen, + final long ingestExternalFileOptionsHandle) throws RocksDBException; + private static native void verifyChecksum(final long handle) throws RocksDBException; + private static native long getDefaultColumnFamily(final long handle); + private static native Map getPropertiesOfAllTables( final long handle, final long columnFamilyHandle) throws RocksDBException; - private native Map getPropertiesOfTablesInRange( - final long handle, final long columnFamilyHandle, - final long[] rangeSliceHandles); - private native long[] suggestCompactRange(final long handle, - final long columnFamilyHandle) throws RocksDBException; - private native void promoteL0(final long handle, - final long columnFamilyHandle, final int tragetLevel) + private static native Map getPropertiesOfTablesInRange( + final long handle, final long columnFamilyHandle, final long[] rangeSliceHandles); + private static native long[] suggestCompactRange(final long handle, final long columnFamilyHandle) throws RocksDBException; - private native void startTrace(final long handle, final long maxTraceFileSize, + private static native void promoteL0(final long handle, final long columnFamilyHandle, + final int tragetLevel) throws RocksDBException; + private static native void startTrace(final long handle, final long maxTraceFileSize, final long traceWriterHandle) throws RocksDBException; - private native void endTrace(final long handle) throws RocksDBException; - private native void tryCatchUpWithPrimary(final long handle) throws RocksDBException; - private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, + private static native void endTrace(final long handle) throws RocksDBException; + private static native void tryCatchUpWithPrimary(final long handle) throws RocksDBException; + private static native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, boolean include_end) throws RocksDBException; private static native void destroyDB(final String path, final long optionsHandle) diff --git a/java/src/main/java/org/rocksdb/RocksEnv.java b/java/src/main/java/org/rocksdb/RocksEnv.java index ca010c9f9c5..eba5b25532a 100644 --- a/java/src/main/java/org/rocksdb/RocksEnv.java +++ b/java/src/main/java/org/rocksdb/RocksEnv.java @@ -27,5 +27,9 @@ public class RocksEnv extends Env { super(handle); } - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/RocksIterator.java b/java/src/main/java/org/rocksdb/RocksIterator.java index 20e56d2ebb0..8e331d51845 100644 --- a/java/src/main/java/org/rocksdb/RocksIterator.java +++ b/java/src/main/java/org/rocksdb/RocksIterator.java @@ -5,6 +5,8 @@ package org.rocksdb; +import static org.rocksdb.util.BufferUtil.CheckBounds; + import java.nio.ByteBuffer; /** @@ -39,6 +41,45 @@ public byte[] key() { return key0(nativeHandle_); } + /** + *

Return the key for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.

+ * + *

REQUIRES: {@link #isValid()}

+ * + * @param key the out-value to receive the retrieved key. + * @return The size of the actual key. If the return key is greater than + * the length of the buffer {@code key}, then it indicates that the size of the + * input buffer {@code key} is insufficient and partial result will + * be returned. + */ + public int key(final byte[] key) { + assert isOwningHandle(); + return keyByteArray0(nativeHandle_, key, 0, key.length); + } + + /** + *

Return the key for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.

+ * + *

REQUIRES: {@link #isValid()}

+ * + * @param key the out-value to receive the retrieved key. + * @param offset in {@code key} at which to place the retrieved key + * @param len limit to length of received key returned + * @return The size of the actual key. If the return key is greater than + * {@code len}, then it indicates that the size of the + * input buffer {@code key} is insufficient and partial result will + * be returned. + */ + public int key(final byte[] key, final int offset, final int len) { + assert isOwningHandle(); + CheckBounds(offset, len, key.length); + return keyByteArray0(nativeHandle_, key, offset, len); + } + /** *

Return the key for the current entry. The underlying storage for * the returned slice is valid only until the next modification of @@ -48,7 +89,6 @@ public byte[] key() { * * @param key the out-value to receive the retrieved key. * It is using position and limit. Limit is set according to key size. - * Supports direct buffer only. * @return The size of the actual key. If the return key is greater than the * length of {@code key}, then it indicates that the size of the * input buffer {@code key} is insufficient and partial result will @@ -90,7 +130,6 @@ public byte[] value() { * * @param value the out-value to receive the retrieved value. * It is using position and limit. Limit is set according to value size. - * Supports direct buffer only. * @return The size of the actual value. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will @@ -110,31 +149,132 @@ public int value(final ByteBuffer value) { return result; } - @Override protected final native void disposeInternal(final long handle); - @Override final native boolean isValid0(long handle); - @Override final native void seekToFirst0(long handle); - @Override final native void seekToLast0(long handle); - @Override final native void next0(long handle); - @Override final native void prev0(long handle); - @Override final native void refresh0(long handle); - @Override final native void seek0(long handle, byte[] target, int targetLen); - @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); + /** + *

Return the value for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.

+ * + *

REQUIRES: {@link #isValid()}

+ * + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. + */ + public int value(final byte[] value) { + assert isOwningHandle(); + return valueByteArray0(nativeHandle_, value, 0, value.length); + } + + /** + *

Return the value for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.

+ * + *

REQUIRES: {@link #isValid()}

+ * + * @param value the out-value to receive the retrieved value. + * @param offset the offset within value at which to place the result + * @param len the length available in value after offset, for placing the result + * @return The size of the actual value. If the return value is greater than {@code len}, + * then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. + */ + public int value(final byte[] value, final int offset, final int len) { + assert isOwningHandle(); + CheckBounds(offset, len, value.length); + return valueByteArray0(nativeHandle_, value, offset, len); + } + + @Override final native void refresh1(long handle, long snapshotHandle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); @Override - final native void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen); + final boolean isValid0(long handle) { + return isValid0Jni(handle); + } + private static native boolean isValid0Jni(long handle); + @Override - final native void seekByteArray0(long handle, byte[] target, int targetOffset, int targetLen); + final void seekToFirst0(long handle) { + seekToFirst0Jni(handle); + } + private static native void seekToFirst0Jni(long handle); + + @Override + final void seekToLast0(long handle) { + seekToLast0Jni(handle); + } + private static native void seekToLast0Jni(long handle); + @Override - final native void seekForPrevDirect0( + final void next0(long handle) { + next0Jni(handle); + } + private static native void next0Jni(long handle); + + @Override + final void prev0(long handle) { + prev0Jni(handle); + } + private static native void prev0Jni(long handle); + @Override + final void refresh0(long handle) { + refresh0Jni(handle); + } + private static native void refresh0Jni(long handle); + @Override + final void seek0(long handle, byte[] target, int targetLen) { + seek0Jni(handle, target, targetLen); + } + private static native void seek0Jni(long handle, byte[] target, int targetLen); + @Override + final void seekForPrev0(long handle, byte[] target, int targetLen) { + seekForPrev0Jni(handle, target, targetLen); + } + private static native void seekForPrev0Jni(long handle, byte[] target, int targetLen); + @Override + final void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen) { + seekDirect0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekDirect0Jni( long handle, ByteBuffer target, int targetOffset, int targetLen); @Override - final native void seekForPrevByteArray0( + final void seekByteArray0(long handle, byte[] target, int targetOffset, int targetLen) { + seekByteArray0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekByteArray0Jni( long handle, byte[] target, int targetOffset, int targetLen); - @Override final native void status0(long handle) throws RocksDBException; - - private native byte[] key0(long handle); - private native byte[] value0(long handle); - private native int keyDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); - private native int keyByteArray0(long handle, byte[] array, int arrayOffset, int arrayLen); - private native int valueDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); - private native int valueByteArray0(long handle, byte[] array, int arrayOffset, int arrayLen); + @Override + final void seekForPrevDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen) { + seekForPrevDirect0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekForPrevDirect0Jni( + long handle, ByteBuffer target, int targetOffset, int targetLen); + @Override + final void seekForPrevByteArray0(long handle, byte[] target, int targetOffset, int targetLen) { + seekForPrevByteArray0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekForPrevByteArray0Jni( + long handle, byte[] target, int targetOffset, int targetLen); + @Override + final void status0(long handle) throws RocksDBException { + status0Jni(handle); + } + private static native void status0Jni(long handle) throws RocksDBException; + + private static native byte[] key0(long handle); + private static native byte[] value0(long handle); + private static native int keyDirect0( + long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); + private static native int keyByteArray0(long handle, byte[] array, int arrayOffset, int arrayLen); + private static native int valueDirect0( + long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); + private static native int valueByteArray0( + long handle, byte[] array, int arrayOffset, int arrayLen); } diff --git a/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java index 819c21c2c30..78f35e3f86a 100644 --- a/java/src/main/java/org/rocksdb/RocksIteratorInterface.java +++ b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java @@ -116,12 +116,23 @@ public interface RocksIteratorInterface { void status() throws RocksDBException; /** - *

If supported, renew the iterator to represent the latest state. The iterator will be - * invalidated after the call. Not supported if {@link ReadOptions#setSnapshot(Snapshot)} was - * specified when creating the iterator.

+ *

If supported, the DB state that the iterator reads from is updated to + * the latest state. The iterator will be invalidated after the call. + * Regardless of whether the iterator was created/refreshed previously with + * or without a snapshot, the iterator will be reading the latest DB state + * after this call.

+ *

Note that you will need to call a Seek*() function to get the iterator + * back into a valid state before calling a function that assumes the + * state is already valid, like Next().

* * @throws RocksDBException thrown if the operation is not supported or an error happens in the * underlying native library */ void refresh() throws RocksDBException; + + /** + * Similar to {@link #refresh()} but the iterator will be reading the latest DB state under the + * given snapshot. + */ + void refresh(Snapshot snapshot) throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/RocksMemEnv.java b/java/src/main/java/org/rocksdb/RocksMemEnv.java index 39a6f6e1c66..be04a90031e 100644 --- a/java/src/main/java/org/rocksdb/RocksMemEnv.java +++ b/java/src/main/java/org/rocksdb/RocksMemEnv.java @@ -27,5 +27,9 @@ public RocksMemEnv(final Env baseEnv) { } private static native long createMemEnv(final long baseEnvHandle); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java index e2c1b97d894..b9d14392909 100644 --- a/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java @@ -44,7 +44,7 @@ public long lookahead() { return newMemTableFactoryHandle0(lookahead_); } - private native long newMemTableFactoryHandle0(long lookahead) + private static native long newMemTableFactoryHandle0(long lookahead) throws IllegalArgumentException; private long lookahead_; diff --git a/java/src/main/java/org/rocksdb/Slice.java b/java/src/main/java/org/rocksdb/Slice.java index 6a01374d655..54d1d1bbf58 100644 --- a/java/src/main/java/org/rocksdb/Slice.java +++ b/java/src/main/java/org/rocksdb/Slice.java @@ -127,9 +127,7 @@ protected void disposeInternal() { @Override protected final native byte[] data0(long handle); private static native long createNewSlice0(final byte[] data, final int length); private static native long createNewSlice1(final byte[] data); - private native void clear0(long handle, boolean internalBuffer, - long internalBufferOffset); - private native void removePrefix0(long handle, int length); - private native void disposeInternalBuf(final long handle, - long internalBufferOffset); + private static native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); + private static native void removePrefix0(long handle, int length); + private static native void disposeInternalBuf(final long handle, long internalBufferOffset); } diff --git a/java/src/main/java/org/rocksdb/Snapshot.java b/java/src/main/java/org/rocksdb/Snapshot.java index 1f471bd31a4..e0ce73cef19 100644 --- a/java/src/main/java/org/rocksdb/Snapshot.java +++ b/java/src/main/java/org/rocksdb/Snapshot.java @@ -37,5 +37,5 @@ protected final void disposeInternal(final long handle) { */ } - private native long getSequenceNumber(long handle); + private static native long getSequenceNumber(long handle); } diff --git a/java/src/main/java/org/rocksdb/SstFileManager.java b/java/src/main/java/org/rocksdb/SstFileManager.java index 0b9a60061f0..efce94db24f 100644 --- a/java/src/main/java/org/rocksdb/SstFileManager.java +++ b/java/src/main/java/org/rocksdb/SstFileManager.java @@ -231,19 +231,20 @@ public void setMaxTrashDBRatio(final double ratio) { private static native long newSstFileManager(final long handle, final long logger_handle, final long rateBytesPerSec, final double maxTrashDbRatio, final long bytesMaxDeleteChunk) throws RocksDBException; - private native void setMaxAllowedSpaceUsage(final long handle, - final long maxAllowedSpace); - private native void setCompactionBufferSize(final long handle, - final long compactionBufferSize); - private native boolean isMaxAllowedSpaceReached(final long handle); - private native boolean isMaxAllowedSpaceReachedIncludingCompactions( - final long handle); - private native long getTotalSize(final long handle); - private native Map getTrackedFiles(final long handle); - private native long getDeleteRateBytesPerSecond(final long handle); - private native void setDeleteRateBytesPerSecond(final long handle, - final long deleteRate); - private native double getMaxTrashDBRatio(final long handle); - private native void setMaxTrashDBRatio(final long handle, final double ratio); - @Override protected native void disposeInternal(final long handle); + private static native void setMaxAllowedSpaceUsage(final long handle, final long maxAllowedSpace); + private static native void setCompactionBufferSize( + final long handle, final long compactionBufferSize); + private static native boolean isMaxAllowedSpaceReached(final long handle); + private static native boolean isMaxAllowedSpaceReachedIncludingCompactions(final long handle); + private static native long getTotalSize(final long handle); + private static native Map getTrackedFiles(final long handle); + private static native long getDeleteRateBytesPerSecond(final long handle); + private static native void setDeleteRateBytesPerSecond(final long handle, final long deleteRate); + private static native double getMaxTrashDBRatio(final long handle); + private static native void setMaxTrashDBRatio(final long handle, final double ratio); + @Override + protected void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/SstFileMetaData.java b/java/src/main/java/org/rocksdb/SstFileMetaData.java index 88ea8152a6a..6025d0b422a 100644 --- a/java/src/main/java/org/rocksdb/SstFileMetaData.java +++ b/java/src/main/java/org/rocksdb/SstFileMetaData.java @@ -20,6 +20,7 @@ public class SstFileMetaData { private final boolean beingCompacted; private final long numEntries; private final long numDeletions; + private final byte[] fileChecksum; /** * Called from JNI C++ @@ -35,12 +36,13 @@ public class SstFileMetaData { * @param beingCompacted true if the file is being compacted, false otherwise * @param numEntries the number of entries * @param numDeletions the number of deletions + * @param fileChecksum the full file checksum (if enabled) */ @SuppressWarnings("PMD.ArrayIsStoredDirectly") protected SstFileMetaData(final String fileName, final String path, final long size, final long smallestSeqno, final long largestSeqno, final byte[] smallestKey, final byte[] largestKey, final long numReadsSampled, final boolean beingCompacted, - final long numEntries, final long numDeletions) { + final long numEntries, final long numDeletions, final byte[] fileChecksum) { this.fileName = fileName; this.path = path; this.size = size; @@ -52,6 +54,7 @@ protected SstFileMetaData(final String fileName, final String path, final long s this.beingCompacted = beingCompacted; this.numEntries = numEntries; this.numDeletions = numDeletions; + this.fileChecksum = fileChecksum; } /** @@ -154,4 +157,14 @@ public long numEntries() { public long numDeletions() { return numDeletions; } + + /** + * Get the full file checksum iff full file checksum is enabled. + * + * @return the file's checksum + */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public byte[] fileChecksum() { + return fileChecksum; + } } diff --git a/java/src/main/java/org/rocksdb/SstFileReader.java b/java/src/main/java/org/rocksdb/SstFileReader.java index 939d3937536..46bebf1dd2e 100644 --- a/java/src/main/java/org/rocksdb/SstFileReader.java +++ b/java/src/main/java/org/rocksdb/SstFileReader.java @@ -65,14 +65,17 @@ public TableProperties getTableProperties() throws RocksDBException { return getTableProperties(nativeHandle_); } - @Override protected final native void disposeInternal(final long handle); - private native long newIterator(final long handle, final long readOptionsHandle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + private static native long newIterator(final long handle, final long readOptionsHandle); - private native void open(final long handle, final String filePath) - throws RocksDBException; + private static native void open(final long handle, final String filePath) throws RocksDBException; private static native long newSstFileReader(final long optionsHandle); - private native void verifyChecksum(final long handle) throws RocksDBException; - private native TableProperties getTableProperties(final long handle) + private static native void verifyChecksum(final long handle) throws RocksDBException; + private static native TableProperties getTableProperties(final long handle) throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/SstFileReaderIterator.java b/java/src/main/java/org/rocksdb/SstFileReaderIterator.java index a4a08167b18..31f2f393aaf 100644 --- a/java/src/main/java/org/rocksdb/SstFileReaderIterator.java +++ b/java/src/main/java/org/rocksdb/SstFileReaderIterator.java @@ -108,33 +108,95 @@ public int value(final ByteBuffer value) { return result; } - @Override protected final native void disposeInternal(final long handle); - @Override final native boolean isValid0(long handle); - @Override final native void seekToFirst0(long handle); - @Override final native void seekToLast0(long handle); - @Override final native void next0(long handle); - @Override final native void prev0(long handle); - @Override final native void refresh0(long handle) throws RocksDBException; - @Override final native void seek0(long handle, byte[] target, int targetLen); - @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); - @Override final native void status0(long handle) throws RocksDBException; + @Override final native void refresh1(long handle, long snapshotHandle); @Override - final native void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen); + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + @Override + final boolean isValid0(long handle) { + return isValid0Jni(handle); + } + private static native boolean isValid0Jni(long handle); + @Override + final void seekToFirst0(long handle) { + seekToFirst0Jni(handle); + } + private static native void seekToFirst0Jni(long handle); + @Override + final void seekToLast0(long handle) { + seekToLast0Jni(handle); + } + private static native void seekToLast0Jni(long handle); + @Override + final void next0(long handle) { + next0Jni(handle); + } + private static native void next0Jni(long handle); + @Override + final void prev0(long handle) { + prev0Jni(handle); + } + private static native void prev0Jni(long handle); + @Override + final void refresh0(long handle) throws RocksDBException { + refresh0Jni(handle); + } + private static native void refresh0Jni(long handle) throws RocksDBException; @Override - final native void seekForPrevDirect0( + final void seek0(long handle, byte[] target, int targetLen) { + seek0Jni(handle, target, targetLen); + } + private static native void seek0Jni(long handle, byte[] target, int targetLen); + @Override + final void seekForPrev0(long handle, byte[] target, int targetLen) { + seekForPrev0Jni(handle, target, targetLen); + } + private static native void seekForPrev0Jni(long handle, byte[] target, int targetLen); + + @Override + final void status0(long handle) throws RocksDBException { + status0Jni(handle); + } + private static native void status0Jni(long handle) throws RocksDBException; + @Override + final void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen) { + seekDirect0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekDirect0Jni( + long handle, ByteBuffer target, int targetOffset, int targetLen); + @Override + final void seekForPrevDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen) { + seekForPrevDirect0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekForPrevDirect0Jni( long handle, ByteBuffer target, int targetOffset, int targetLen); @Override - final native void seekByteArray0( + final void seekByteArray0( + final long handle, final byte[] target, final int targetOffset, final int targetLen) { + seekByteArray0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekByteArray0Jni( final long handle, final byte[] target, final int targetOffset, final int targetLen); @Override - final native void seekForPrevByteArray0( + final void seekForPrevByteArray0( + final long handle, final byte[] target, final int targetOffset, final int targetLen) { + seekForPrevByteArray0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekForPrevByteArray0Jni( final long handle, final byte[] target, final int targetOffset, final int targetLen); - private native byte[] key0(long handle); - private native byte[] value0(long handle); + private static native byte[] key0(long handle); + private static native byte[] value0(long handle); - private native int keyDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); - private native int keyByteArray0(long handle, byte[] buffer, int bufferOffset, int bufferLen); - private native int valueDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); - private native int valueByteArray0(long handle, byte[] buffer, int bufferOffset, int bufferLen); + private static native int keyDirect0( + long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); + private static native int keyByteArray0( + long handle, byte[] buffer, int bufferOffset, int bufferLen); + private static native int valueDirect0( + long handle, ByteBuffer buffer, int bufferOffset, int bufferLen); + private static native int valueByteArray0( + long handle, byte[] buffer, int bufferOffset, int bufferLen); } diff --git a/java/src/main/java/org/rocksdb/SstFileWriter.java b/java/src/main/java/org/rocksdb/SstFileWriter.java index d5766bffb61..ff8b574a913 100644 --- a/java/src/main/java/org/rocksdb/SstFileWriter.java +++ b/java/src/main/java/org/rocksdb/SstFileWriter.java @@ -203,33 +203,35 @@ private static native long newSstFileWriter(final long envOptionsHandle, final l private static native long newSstFileWriter( final long envOptionsHandle, final long optionsHandle); - private native void open(final long handle, final String filePath) + private static native void open(final long handle, final String filePath) throws RocksDBException; + + private static native void put(final long handle, final long keyHandle, final long valueHandle) throws RocksDBException; - private native void put(final long handle, final long keyHandle, - final long valueHandle) throws RocksDBException; - - private native void put(final long handle, final byte[] key, - final byte[] value) throws RocksDBException; + private static native void put(final long handle, final byte[] key, final byte[] value) + throws RocksDBException; - private native void putDirect(long handle, ByteBuffer key, int keyOffset, int keyLength, + private static native void putDirect(long handle, ByteBuffer key, int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength) throws RocksDBException; - private native long fileSize(long handle) throws RocksDBException; - - private native void merge(final long handle, final long keyHandle, - final long valueHandle) throws RocksDBException; + private static native long fileSize(long handle) throws RocksDBException; - private native void merge(final long handle, final byte[] key, - final byte[] value) throws RocksDBException; + private static native void merge(final long handle, final long keyHandle, final long valueHandle) + throws RocksDBException; - private native void delete(final long handle, final long keyHandle) + private static native void merge(final long handle, final byte[] key, final byte[] value) throws RocksDBException; - private native void delete(final long handle, final byte[] key) + private static native void delete(final long handle, final long keyHandle) throws RocksDBException; - private native void finish(final long handle) throws RocksDBException; + private static native void delete(final long handle, final byte[] key) throws RocksDBException; + + private static native void finish(final long handle) throws RocksDBException; - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java b/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java index b1ccf08c140..d9b7184aa01 100644 --- a/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java +++ b/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java @@ -15,5 +15,9 @@ public SstPartitionerFixedPrefixFactory(final long prefixLength) { private static native long newSstPartitionerFixedPrefixFactory0(long prefixLength); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/Statistics.java b/java/src/main/java/org/rocksdb/Statistics.java index 09e08ee5699..80ae2458693 100644 --- a/java/src/main/java/org/rocksdb/Statistics.java +++ b/java/src/main/java/org/rocksdb/Statistics.java @@ -148,14 +148,18 @@ private static long newStatisticsInstance(final byte[] ignoreHistograms) { private static native long newStatistics( final byte[] ignoreHistograms, final long otherStatisticsHandle); - @Override protected final native void disposeInternal(final long handle); - - private native byte statsLevel(final long handle); - private native void setStatsLevel(final long handle, final byte statsLevel); - private native long getTickerCount(final long handle, final byte tickerType); - private native long getAndResetTickerCount(final long handle, final byte tickerType); - private native HistogramData getHistogramData(final long handle, final byte histogramType); - private native String getHistogramString(final long handle, final byte histogramType); - private native void reset(final long nativeHandle) throws RocksDBException; - private native String toString(final long nativeHandle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + private static native byte statsLevel(final long handle); + private static native void setStatsLevel(final long handle, final byte statsLevel); + private static native long getTickerCount(final long handle, final byte tickerType); + private static native long getAndResetTickerCount(final long handle, final byte tickerType); + private static native HistogramData getHistogramData(final long handle, final byte histogramType); + private static native String getHistogramString(final long handle, final byte histogramType); + private static native void reset(final long nativeHandle) throws RocksDBException; + private static native String toString(final long nativeHandle); } diff --git a/java/src/main/java/org/rocksdb/StringAppendOperator.java b/java/src/main/java/org/rocksdb/StringAppendOperator.java index 547371e7c08..25b134c44af 100644 --- a/java/src/main/java/org/rocksdb/StringAppendOperator.java +++ b/java/src/main/java/org/rocksdb/StringAppendOperator.java @@ -25,5 +25,9 @@ public StringAppendOperator(final String delim) { private static native long newSharedStringAppendOperator(final char delim); private static native long newSharedStringAppendOperator(final String delim); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/TablePropertiesCollectorFactory.java b/java/src/main/java/org/rocksdb/TablePropertiesCollectorFactory.java new file mode 100644 index 00000000000..ae2789ef826 --- /dev/null +++ b/java/src/main/java/org/rocksdb/TablePropertiesCollectorFactory.java @@ -0,0 +1,44 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public abstract class TablePropertiesCollectorFactory extends RocksObject { + private TablePropertiesCollectorFactory(final long nativeHandle) { + super(nativeHandle); + } + + public static TablePropertiesCollectorFactory NewCompactOnDeletionCollectorFactory( + final long sliding_window_size, final long deletion_trigger, final double deletion_ratio) { + long handle = + newCompactOnDeletionCollectorFactory(sliding_window_size, deletion_trigger, deletion_ratio); + return new TablePropertiesCollectorFactory(handle) { + @Override + protected void disposeInternal(long handle) { + TablePropertiesCollectorFactory.deleteCompactOnDeletionCollectorFactory(handle); + } + }; + } + + /** + * Internal API. Do not use. + * @param nativeHandle + * @return + */ + static TablePropertiesCollectorFactory newWrapper(final long nativeHandle) { + return new TablePropertiesCollectorFactory(nativeHandle) { + @Override + protected void disposeInternal(long handle) { + TablePropertiesCollectorFactory.deleteCompactOnDeletionCollectorFactory(handle); + } + }; + } + + private static native long newCompactOnDeletionCollectorFactory( + final long slidingWindowSize, final long deletionTrigger, final double deletionRatio); + + private static native void deleteCompactOnDeletionCollectorFactory(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index f2ca42776e7..90f0b6ba2e3 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -65,102 +65,152 @@ public enum TickerType { /** * # of times cache miss when accessing filter block from block cache. */ - BLOCK_CACHE_FILTER_MISS((byte) 0x9), + BLOCK_CACHE_FILTER_MISS((byte) 0x8), /** * # of times cache hit when accessing filter block from block cache. */ - BLOCK_CACHE_FILTER_HIT((byte) 0xA), + BLOCK_CACHE_FILTER_HIT((byte) 0x9), /** * # of filter blocks added to block cache. */ - BLOCK_CACHE_FILTER_ADD((byte) 0xB), + BLOCK_CACHE_FILTER_ADD((byte) 0xA), /** * # of bytes of bloom filter blocks inserted into cache */ - BLOCK_CACHE_FILTER_BYTES_INSERT((byte) 0xC), + BLOCK_CACHE_FILTER_BYTES_INSERT((byte) 0xB), /** * # of times cache miss when accessing data block from block cache. */ - BLOCK_CACHE_DATA_MISS((byte) 0xE), + BLOCK_CACHE_DATA_MISS((byte) 0xC), /** * # of times cache hit when accessing data block from block cache. */ - BLOCK_CACHE_DATA_HIT((byte) 0xF), + BLOCK_CACHE_DATA_HIT((byte) 0xD), /** * # of data blocks added to block cache. */ - BLOCK_CACHE_DATA_ADD((byte) 0x10), + BLOCK_CACHE_DATA_ADD((byte) 0xE), /** * # of bytes of data blocks inserted into cache */ - BLOCK_CACHE_DATA_BYTES_INSERT((byte) 0x11), + BLOCK_CACHE_DATA_BYTES_INSERT((byte) 0xF), /** * # of bytes read from cache. */ - BLOCK_CACHE_BYTES_READ((byte) 0x12), + BLOCK_CACHE_BYTES_READ((byte) 0x10), /** * # of bytes written into cache. */ - BLOCK_CACHE_BYTES_WRITE((byte) 0x13), + BLOCK_CACHE_BYTES_WRITE((byte) 0x11), + + /** + * Block cache related stats for Compression dictionaries + */ + BLOCK_CACHE_COMPRESSION_DICT_MISS((byte) 0x12), + BLOCK_CACHE_COMPRESSION_DICT_HIT((byte) 0x13), + BLOCK_CACHE_COMPRESSION_DICT_ADD((byte) 0x14), + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT((byte) 0x15), + + /** + * Redundant additions to block cache + */ + BLOCK_CACHE_ADD_REDUNDANT((byte) 0x16), + BLOCK_CACHE_INDEX_ADD_REDUNDANT((byte) 0x17), + BLOCK_CACHE_FILTER_ADD_REDUNDANT((byte) 0x18), + BLOCK_CACHE_DATA_ADD_REDUNDANT((byte) 0x19), + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT((byte) 0x1A), + + /** + * Number of secondary cache hits + */ + SECONDARY_CACHE_HITS((byte) 0x1B), + SECONDARY_CACHE_FILTER_HITS((byte) 0x1C), + SECONDARY_CACHE_INDEX_HITS((byte) 0x1D), + SECONDARY_CACHE_DATA_HITS((byte) 0x1E), + + COMPRESSED_SECONDARY_CACHE_DUMMY_HITS((byte) 0x1F), + COMPRESSED_SECONDARY_CACHE_HITS((byte) 0x20), + COMPRESSED_SECONDARY_CACHE_PROMOTIONS((byte) 0x21), + COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS((byte) 0x22), /** * # of times bloom filter has avoided file reads. */ - BLOOM_FILTER_USEFUL((byte) 0x14), + BLOOM_FILTER_USEFUL((byte) 0x23), + + /** + * # of times bloom FullFilter has not avoided the reads. + */ + BLOOM_FILTER_FULL_POSITIVE((byte) 0x24), + + /** + * # of times bloom FullFilter has not avoided the reads and data actually + * exist. + */ + BLOOM_FILTER_FULL_TRUE_POSITIVE((byte) 0x25), + + /** + * Number of times bloom was checked before creating iterator on a + * file, and the number of times the check was useful in avoiding + * iterator creation (and thus likely IOPs). + */ + BLOOM_FILTER_PREFIX_CHECKED((byte) 0x26), + BLOOM_FILTER_PREFIX_USEFUL((byte) 0x27), + BLOOM_FILTER_PREFIX_TRUE_POSITIVE((byte) 0x28), /** * # persistent cache hit */ - PERSISTENT_CACHE_HIT((byte) 0x15), + PERSISTENT_CACHE_HIT((byte) 0x29), /** * # persistent cache miss */ - PERSISTENT_CACHE_MISS((byte) 0x16), + PERSISTENT_CACHE_MISS((byte) 0x2A), /** * # total simulation block cache hits */ - SIM_BLOCK_CACHE_HIT((byte) 0x17), + SIM_BLOCK_CACHE_HIT((byte) 0x2B), /** * # total simulation block cache misses */ - SIM_BLOCK_CACHE_MISS((byte) 0x18), + SIM_BLOCK_CACHE_MISS((byte) 0x2C), /** * # of memtable hits. */ - MEMTABLE_HIT((byte) 0x19), + MEMTABLE_HIT((byte) 0x2D), /** * # of memtable misses. */ - MEMTABLE_MISS((byte) 0x1A), + MEMTABLE_MISS((byte) 0x2E), /** * # of Get() queries served by L0 */ - GET_HIT_L0((byte) 0x1B), + GET_HIT_L0((byte) 0x2F), /** * # of Get() queries served by L1 */ - GET_HIT_L1((byte) 0x1C), + GET_HIT_L1((byte) 0x30), /** * # of Get() queries served by L2 and up */ - GET_HIT_L2_AND_UP((byte) 0x1D), + GET_HIT_L2_AND_UP((byte) 0x31), /** * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction @@ -170,49 +220,59 @@ public enum TickerType { /** * key was written with a newer value. */ - COMPACTION_KEY_DROP_NEWER_ENTRY((byte) 0x1E), + COMPACTION_KEY_DROP_NEWER_ENTRY((byte) 0x32), /** * Also includes keys dropped for range del. * The key is obsolete. */ - COMPACTION_KEY_DROP_OBSOLETE((byte) 0x1F), + COMPACTION_KEY_DROP_OBSOLETE((byte) 0x33), /** * key was covered by a range tombstone. */ - COMPACTION_KEY_DROP_RANGE_DEL((byte) 0x20), + COMPACTION_KEY_DROP_RANGE_DEL((byte) 0x34), /** * User compaction function has dropped the key. */ - COMPACTION_KEY_DROP_USER((byte) 0x21), + COMPACTION_KEY_DROP_USER((byte) 0x35), /** * all keys in range were deleted. */ - COMPACTION_RANGE_DEL_DROP_OBSOLETE((byte) 0x22), + COMPACTION_RANGE_DEL_DROP_OBSOLETE((byte) 0x36), + + /** + * Deletions obsoleted before bottom level due to file gap optimization. + */ + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE((byte) 0x37), + + /** + * Compactions cancelled to prevent ENOSPC + */ + COMPACTION_CANCELLED((byte) 0x38), /** * Number of keys written to the database via the Put and Write call's. */ - NUMBER_KEYS_WRITTEN((byte) 0x23), + NUMBER_KEYS_WRITTEN((byte) 0x39), /** * Number of Keys read. */ - NUMBER_KEYS_READ((byte) 0x24), + NUMBER_KEYS_READ((byte) 0x3A), /** * Number keys updated, if inplace update is enabled */ - NUMBER_KEYS_UPDATED((byte) 0x25), + NUMBER_KEYS_UPDATED((byte) 0x3B), /** * The number of uncompressed bytes issued by DB::Put(), DB::Delete(),\ * DB::Merge(), and DB::Write(). */ - BYTES_WRITTEN((byte) 0x26), + BYTES_WRITTEN((byte) 0x3C), /** * The number of uncompressed bytes read from DB::Get(). It could be @@ -221,162 +281,193 @@ public enum TickerType { * For the number of logical bytes read from DB::MultiGet(), * please use {@link #NUMBER_MULTIGET_BYTES_READ}. */ - BYTES_READ((byte) 0x27), + BYTES_READ((byte) 0x3D), /** * The number of calls to seek. */ - NUMBER_DB_SEEK((byte) 0x28), + NUMBER_DB_SEEK((byte) 0x3E), /** * The number of calls to next. */ - NUMBER_DB_NEXT((byte) 0x29), + NUMBER_DB_NEXT((byte) 0x3F), /** * The number of calls to prev. */ - NUMBER_DB_PREV((byte) 0x2A), + NUMBER_DB_PREV((byte) 0x40), /** * The number of calls to seek that returned data. */ - NUMBER_DB_SEEK_FOUND((byte) 0x2B), + NUMBER_DB_SEEK_FOUND((byte) 0x41), /** * The number of calls to next that returned data. */ - NUMBER_DB_NEXT_FOUND((byte) 0x2C), + NUMBER_DB_NEXT_FOUND((byte) 0x42), /** * The number of calls to prev that returned data. */ - NUMBER_DB_PREV_FOUND((byte) 0x2D), + NUMBER_DB_PREV_FOUND((byte) 0x43), /** * The number of uncompressed bytes read from an iterator. * Includes size of key and value. */ - ITER_BYTES_READ((byte) 0x2E), + ITER_BYTES_READ((byte) 0x44), - NO_FILE_OPENS((byte) 0x30), + /** + * Number of internal skipped during iteration + */ + NUMBER_ITER_SKIP((byte) 0x45), - NO_FILE_ERRORS((byte) 0x31), + /** + * Number of times we had to reseek inside an iteration to skip + * over large number of keys with same userkey. + */ + NUMBER_OF_RESEEKS_IN_ITERATION((byte) 0x46), + + /** + * Number of iterators created. + */ + NO_ITERATOR_CREATED((byte) 0x47), + + /** + * Number of iterators deleted. + */ + NO_ITERATOR_DELETED((byte) 0x48), + + NO_FILE_OPENS((byte) 0x49), + + NO_FILE_ERRORS((byte) 0x4A), /** * Writer has to wait for compaction or flush to finish. */ - STALL_MICROS((byte) 0x35), + STALL_MICROS((byte) 0x4B), /** * The wait time for db mutex. * * Disabled by default. To enable it set stats level to {@link StatsLevel#ALL} */ - DB_MUTEX_WAIT_MICROS((byte) 0x36), + DB_MUTEX_WAIT_MICROS((byte) 0x4C), /** * Number of MultiGet calls. */ - NUMBER_MULTIGET_CALLS((byte) 0x39), + NUMBER_MULTIGET_CALLS((byte) 0x4D), /** * Number of MultiGet keys read. */ - NUMBER_MULTIGET_KEYS_READ((byte) 0x3A), + NUMBER_MULTIGET_KEYS_READ((byte) 0x4E), /** * Number of MultiGet bytes read. */ - NUMBER_MULTIGET_BYTES_READ((byte) 0x3B), - - NUMBER_MERGE_FAILURES((byte) 0x3D), + NUMBER_MULTIGET_BYTES_READ((byte) 0x4F), /** - * Number of times bloom was checked before creating iterator on a - * file, and the number of times the check was useful in avoiding - * iterator creation (and thus likely IOPs). + * Number of MultiGet keys found (vs number requested) */ - BLOOM_FILTER_PREFIX_CHECKED((byte) 0x3E), - BLOOM_FILTER_PREFIX_USEFUL((byte) 0x3F), + NUMBER_MULTIGET_KEYS_FOUND((byte) 0x50), - /** - * Number of times we had to reseek inside an iteration to skip - * over large number of keys with same userkey. - */ - NUMBER_OF_RESEEKS_IN_ITERATION((byte) 0x40), + NUMBER_MERGE_FAILURES((byte) 0x51), /** * Record the number of calls to {@link RocksDB#getUpdatesSince(long)}. Useful to keep track of * transaction log iterator refreshes. */ - GET_UPDATES_SINCE_CALLS((byte) 0x41), + GET_UPDATES_SINCE_CALLS((byte) 0x52), /** * Number of times WAL sync is done. */ - WAL_FILE_SYNCED((byte) 0x46), + WAL_FILE_SYNCED((byte) 0x53), /** * Number of bytes written to WAL. */ - WAL_FILE_BYTES((byte) 0x47), + WAL_FILE_BYTES((byte) 0x54), /** * Writes can be processed by requesting thread or by the thread at the * head of the writers queue. */ - WRITE_DONE_BY_SELF((byte) 0x48), + WRITE_DONE_BY_SELF((byte) 0x55), /** * Equivalent to writes done for others. */ - WRITE_DONE_BY_OTHER((byte) 0x49), + WRITE_DONE_BY_OTHER((byte) 0x56), /** * Number of Write calls that request WAL. */ - WRITE_WITH_WAL((byte) 0x4B), + WRITE_WITH_WAL((byte) 0x57), /** * Bytes read during compaction. */ - COMPACT_READ_BYTES((byte) 0x4C), + COMPACT_READ_BYTES((byte) 0x58), /** * Bytes written during compaction. */ - COMPACT_WRITE_BYTES((byte) 0x4D), + COMPACT_WRITE_BYTES((byte) 0x59), /** * Bytes written during flush. */ - FLUSH_WRITE_BYTES((byte) 0x4E), + FLUSH_WRITE_BYTES((byte) 0x5A), + + /** + * Compaction read and write statistics broken down by CompactionReason + */ + COMPACT_READ_BYTES_MARKED((byte) 0x5B), + COMPACT_READ_BYTES_PERIODIC((byte) 0x5C), + COMPACT_READ_BYTES_TTL((byte) 0x5D), + COMPACT_WRITE_BYTES_MARKED((byte) 0x5E), + COMPACT_WRITE_BYTES_PERIODIC((byte) 0x5F), + COMPACT_WRITE_BYTES_TTL((byte) 0x60), /** * Number of table's properties loaded directly from file, without creating * table reader object. */ - NUMBER_DIRECT_LOAD_TABLE_PROPERTIES((byte) 0x4F), - NUMBER_SUPERVERSION_ACQUIRES((byte) 0x50), - NUMBER_SUPERVERSION_RELEASES((byte) 0x51), - NUMBER_SUPERVERSION_CLEANUPS((byte) 0x52), + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES((byte) 0x61), + NUMBER_SUPERVERSION_ACQUIRES((byte) 0x62), + NUMBER_SUPERVERSION_RELEASES((byte) 0x63), + NUMBER_SUPERVERSION_CLEANUPS((byte) 0x64), /** * # of compressions/decompressions executed */ - NUMBER_BLOCK_COMPRESSED((byte) 0x53), - NUMBER_BLOCK_DECOMPRESSED((byte) 0x54), + NUMBER_BLOCK_COMPRESSED((byte) 0x65), + NUMBER_BLOCK_DECOMPRESSED((byte) 0x66), - NUMBER_BLOCK_NOT_COMPRESSED((byte) 0x55), - MERGE_OPERATION_TOTAL_TIME((byte) 0x56), - FILTER_OPERATION_TOTAL_TIME((byte) 0x57), + BYTES_COMPRESSED_FROM((byte) 0x67), + BYTES_COMPRESSED_TO((byte) 0x68), + BYTES_COMPRESSION_BYPASSED((byte) 0x69), + BYTES_COMPRESSION_REJECTED((byte) 0x6A), + NUMBER_BLOCK_COMPRESSION_BYPASSED((byte) 0x6B), + NUMBER_BLOCK_COMPRESSION_REJECTED((byte) 0x6C), + BYTES_DECOMPRESSED_FROM((byte) 0x6D), + BYTES_DECOMPRESSED_TO((byte) 0x6E), + + MERGE_OPERATION_TOTAL_TIME((byte) 0x6F), + FILTER_OPERATION_TOTAL_TIME((byte) 0x70), + COMPACTION_CPU_TOTAL_TIME((byte) 0x71), /** * Row cache. */ - ROW_CACHE_HIT((byte) 0x58), - ROW_CACHE_MISS((byte) 0x59), + ROW_CACHE_HIT((byte) 0x72), + ROW_CACHE_MISS((byte) 0x73), /** * Read amplification statistics. @@ -390,271 +481,254 @@ public enum TickerType { /** * Estimate of total bytes actually used. */ - READ_AMP_ESTIMATE_USEFUL_BYTES((byte) 0x5A), + READ_AMP_ESTIMATE_USEFUL_BYTES((byte) 0x74), /** * Total size of loaded data blocks. */ - READ_AMP_TOTAL_READ_BYTES((byte) 0x5B), + READ_AMP_TOTAL_READ_BYTES((byte) 0x75), /** * Number of refill intervals where rate limiter's bytes are fully consumed. */ - NUMBER_RATE_LIMITER_DRAINS((byte) 0x5C), - - /** - * Number of internal skipped during iteration - */ - NUMBER_ITER_SKIP((byte) 0x5D), - - /** - * Number of MultiGet keys found (vs number requested) - */ - NUMBER_MULTIGET_KEYS_FOUND((byte) 0x5E), - - // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX - /** - * Number of iterators created. - */ - NO_ITERATOR_CREATED((byte) -0x01), - - /** - * Number of iterators deleted. - */ - NO_ITERATOR_DELETED((byte) 0x60), - - /** - * Deletions obsoleted before bottom level due to file gap optimization. - */ - COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE((byte) 0x61), - - /** - * If a compaction was cancelled in sfm to prevent ENOSPC - */ - COMPACTION_CANCELLED((byte) 0x62), - - /** - * # of times bloom FullFilter has not avoided the reads. - */ - BLOOM_FILTER_FULL_POSITIVE((byte) 0x63), - - /** - * # of times bloom FullFilter has not avoided the reads and data actually - * exist. - */ - BLOOM_FILTER_FULL_TRUE_POSITIVE((byte) 0x64), + NUMBER_RATE_LIMITER_DRAINS((byte) 0x76), /** * BlobDB specific stats * # of Put/PutTTL/PutUntil to BlobDB. */ - BLOB_DB_NUM_PUT((byte) 0x65), + BLOB_DB_NUM_PUT((byte) 0x77), /** * # of Write to BlobDB. */ - BLOB_DB_NUM_WRITE((byte) 0x66), + BLOB_DB_NUM_WRITE((byte) 0x78), /** * # of Get to BlobDB. */ - BLOB_DB_NUM_GET((byte) 0x67), + BLOB_DB_NUM_GET((byte) 0x79), /** * # of MultiGet to BlobDB. */ - BLOB_DB_NUM_MULTIGET((byte) 0x68), + BLOB_DB_NUM_MULTIGET((byte) 0x7A), /** * # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. */ - BLOB_DB_NUM_SEEK((byte) 0x69), + BLOB_DB_NUM_SEEK((byte) 0x7B), /** * # of Next to BlobDB iterator. */ - BLOB_DB_NUM_NEXT((byte) 0x6A), + BLOB_DB_NUM_NEXT((byte) 0x7C), /** * # of Prev to BlobDB iterator. */ - BLOB_DB_NUM_PREV((byte) 0x6B), + BLOB_DB_NUM_PREV((byte) 0x7D), /** * # of keys written to BlobDB. */ - BLOB_DB_NUM_KEYS_WRITTEN((byte) 0x6C), + BLOB_DB_NUM_KEYS_WRITTEN((byte) 0x7E), /** * # of keys read from BlobDB. */ - BLOB_DB_NUM_KEYS_READ((byte) 0x6D), + BLOB_DB_NUM_KEYS_READ((byte) 0x7F), /** * # of bytes (key + value) written to BlobDB. */ - BLOB_DB_BYTES_WRITTEN((byte) 0x6E), + BLOB_DB_BYTES_WRITTEN((byte) -0x1), /** * # of bytes (keys + value) read from BlobDB. */ - BLOB_DB_BYTES_READ((byte) 0x6F), + BLOB_DB_BYTES_READ((byte) -0x2), /** * # of keys written by BlobDB as non-TTL inlined value. */ - BLOB_DB_WRITE_INLINED((byte) 0x70), + BLOB_DB_WRITE_INLINED((byte) -0x3), /** * # of keys written by BlobDB as TTL inlined value. */ - BLOB_DB_WRITE_INLINED_TTL((byte) 0x71), + BLOB_DB_WRITE_INLINED_TTL((byte) -0x4), /** * # of keys written by BlobDB as non-TTL blob value. */ - BLOB_DB_WRITE_BLOB((byte) 0x72), + BLOB_DB_WRITE_BLOB((byte) -0x5), /** * # of keys written by BlobDB as TTL blob value. */ - BLOB_DB_WRITE_BLOB_TTL((byte) 0x73), + BLOB_DB_WRITE_BLOB_TTL((byte) -0x6), /** * # of bytes written to blob file. */ - BLOB_DB_BLOB_FILE_BYTES_WRITTEN((byte) 0x74), + BLOB_DB_BLOB_FILE_BYTES_WRITTEN((byte) -0x7), /** * # of bytes read from blob file. */ - BLOB_DB_BLOB_FILE_BYTES_READ((byte) 0x75), + BLOB_DB_BLOB_FILE_BYTES_READ((byte) -0x8), /** * # of times a blob files being synced. */ - BLOB_DB_BLOB_FILE_SYNCED((byte) 0x76), + BLOB_DB_BLOB_FILE_SYNCED((byte) -0x9), /** * # of blob index evicted from base DB by BlobDB compaction filter because * of expiration. */ - BLOB_DB_BLOB_INDEX_EXPIRED_COUNT((byte) 0x77), + BLOB_DB_BLOB_INDEX_EXPIRED_COUNT((byte) -0xA), /** * Size of blob index evicted from base DB by BlobDB compaction filter * because of expiration. */ - BLOB_DB_BLOB_INDEX_EXPIRED_SIZE((byte) 0x78), + BLOB_DB_BLOB_INDEX_EXPIRED_SIZE((byte) -0xB), /** * # of blob index evicted from base DB by BlobDB compaction filter because * of corresponding file deleted. */ - BLOB_DB_BLOB_INDEX_EVICTED_COUNT((byte) 0x79), + BLOB_DB_BLOB_INDEX_EVICTED_COUNT((byte) -0xC), /** * Size of blob index evicted from base DB by BlobDB compaction filter * because of corresponding file deleted. */ - BLOB_DB_BLOB_INDEX_EVICTED_SIZE((byte) 0x7A), + BLOB_DB_BLOB_INDEX_EVICTED_SIZE((byte) -0xD), /** * # of blob files being garbage collected. */ - BLOB_DB_GC_NUM_FILES((byte) 0x7B), + BLOB_DB_GC_NUM_FILES((byte) -0xE), /** * # of blob files generated by garbage collection. */ - BLOB_DB_GC_NUM_NEW_FILES((byte) 0x7C), + BLOB_DB_GC_NUM_NEW_FILES((byte) -0xF), /** * # of BlobDB garbage collection failures. */ - BLOB_DB_GC_FAILURES((byte) 0x7D), + BLOB_DB_GC_FAILURES((byte) -0x10), /** * # of keys relocated to new blob file by garbage collection. */ - BLOB_DB_GC_NUM_KEYS_RELOCATED((byte) -0x02), + BLOB_DB_GC_NUM_KEYS_RELOCATED((byte) -0x11), /** * # of bytes relocated to new blob file by garbage collection. */ - BLOB_DB_GC_BYTES_RELOCATED((byte) -0x05), + BLOB_DB_GC_BYTES_RELOCATED((byte) -0x12), /** * # of blob files evicted because of BlobDB is full. */ - BLOB_DB_FIFO_NUM_FILES_EVICTED((byte) -0x06), + BLOB_DB_FIFO_NUM_FILES_EVICTED((byte) -0x13), /** * # of keys in the blob files evicted because of BlobDB is full. */ - BLOB_DB_FIFO_NUM_KEYS_EVICTED((byte) -0x07), + BLOB_DB_FIFO_NUM_KEYS_EVICTED((byte) -0x14), /** * # of bytes in the blob files evicted because of BlobDB is full. */ - BLOB_DB_FIFO_BYTES_EVICTED((byte) -0x08), + BLOB_DB_FIFO_BYTES_EVICTED((byte) -0x15), + + /** + * # of times cache miss when accessing blob from blob cache. + */ + BLOB_DB_CACHE_MISS((byte) -0x16), + + /** + * # of times cache hit when accessing blob from blob cache. + */ + BLOB_DB_CACHE_HIT((byte) -0x17), + + /** + * # of data blocks added to blob cache. + */ + BLOB_DB_CACHE_ADD((byte) -0x18), + + /** + * # # of failures when adding blobs to blob cache. + */ + BLOB_DB_CACHE_ADD_FAILURES((byte) -0x19), + + /** + * # of bytes read from blob cache. + */ + BLOB_DB_CACHE_BYTES_READ((byte) -0x1A), + + /** + * # of bytes written into blob cache. + */ + BLOB_DB_CACHE_BYTES_WRITE((byte) -0x1B), /** * These counters indicate a performance issue in WritePrepared transactions. * We should not seem them ticking them much. * # of times prepare_mutex_ is acquired in the fast path. */ - TXN_PREPARE_MUTEX_OVERHEAD((byte) -0x09), + TXN_PREPARE_MUTEX_OVERHEAD((byte) -0x1C), /** * # of times old_commit_map_mutex_ is acquired in the fast path. */ - TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD((byte) -0x0A), + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD((byte) -0x1D), /** * # of times we checked a batch for duplicate keys. */ - TXN_DUPLICATE_KEY_OVERHEAD((byte) -0x0B), + TXN_DUPLICATE_KEY_OVERHEAD((byte) -0x1E), /** * # of times snapshot_mutex_ is acquired in the fast path. */ - TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C), + TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x1F), /** * # of times ::Get returned TryAgain due to expired snapshot seq */ - TXN_GET_TRY_AGAIN((byte) -0x0D), + TXN_GET_TRY_AGAIN((byte) -0x20), /** * # of files marked as trash by delete scheduler */ - FILES_MARKED_TRASH((byte) -0x0E), + FILES_MARKED_TRASH((byte) -0x21), /** - * # of files deleted immediately by delete scheduler + * # of trash files deleted by the background thread from the trash queue */ - FILES_DELETED_IMMEDIATELY((byte) -0x0f), + FILES_DELETED_FROM_TRASH_QUEUE((byte) -0x22), /** - * Compaction read and write statistics broken down by CompactionReason + * # of files deleted immediately by delete scheduler */ - COMPACT_READ_BYTES_MARKED((byte) -0x10), - COMPACT_READ_BYTES_PERIODIC((byte) -0x11), - COMPACT_READ_BYTES_TTL((byte) -0x12), - COMPACT_WRITE_BYTES_MARKED((byte) -0x13), - COMPACT_WRITE_BYTES_PERIODIC((byte) -0x14), - COMPACT_WRITE_BYTES_TTL((byte) -0x15), + FILES_DELETED_IMMEDIATELY((byte) -0x23), /** * DB error handler statistics */ - ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16), - ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17), - ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18), - ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19), - ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A), - ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B), + ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x24), + ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x25), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x26), + ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x27), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x28), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x29), /** * Bytes of raw data (payload) found on memtable at flush time. @@ -662,121 +736,149 @@ public enum TickerType { * at flush time) and useful payload (bytes of data that will * eventually be written to SSTable). */ - MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x1C), + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x2A), /** * Outdated bytes of data present on memtable at flush time. */ - MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x1D), - - /** - * Number of secondary cache hits - */ - SECONDARY_CACHE_HITS((byte) -0x1E), + MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x2B), /** * Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. */ - VERIFY_CHECKSUM_READ_BYTES((byte) -0x1F), + VERIFY_CHECKSUM_READ_BYTES((byte) -0x2C), /** * Bytes read/written while creating backups */ - BACKUP_READ_BYTES((byte) -0x20), - BACKUP_WRITE_BYTES((byte) -0x21), + BACKUP_READ_BYTES((byte) -0x2D), + BACKUP_WRITE_BYTES((byte) -0x2E), /** * Remote compaction read/write statistics */ - REMOTE_COMPACT_READ_BYTES((byte) -0x22), - REMOTE_COMPACT_WRITE_BYTES((byte) -0x23), + REMOTE_COMPACT_READ_BYTES((byte) -0x2F), + REMOTE_COMPACT_WRITE_BYTES((byte) -0x30), /** * Tiered storage related statistics */ - HOT_FILE_READ_BYTES((byte) -0x24), - WARM_FILE_READ_BYTES((byte) -0x25), - COLD_FILE_READ_BYTES((byte) -0x26), - HOT_FILE_READ_COUNT((byte) -0x27), - WARM_FILE_READ_COUNT((byte) -0x28), - COLD_FILE_READ_COUNT((byte) -0x29), + HOT_FILE_READ_BYTES((byte) -0x31), + WARM_FILE_READ_BYTES((byte) -0x32), + COLD_FILE_READ_BYTES((byte) -0x33), + HOT_FILE_READ_COUNT((byte) -0x34), + WARM_FILE_READ_COUNT((byte) -0x35), + COLD_FILE_READ_COUNT((byte) -0x36), /** * (non-)last level read statistics */ - LAST_LEVEL_READ_BYTES((byte) -0x2A), - LAST_LEVEL_READ_COUNT((byte) -0x2B), - NON_LAST_LEVEL_READ_BYTES((byte) -0x2C), - NON_LAST_LEVEL_READ_COUNT((byte) -0x2D), + LAST_LEVEL_READ_BYTES((byte) -0x37), + LAST_LEVEL_READ_COUNT((byte) -0x38), + NON_LAST_LEVEL_READ_BYTES((byte) -0x39), + NON_LAST_LEVEL_READ_COUNT((byte) -0x3A), /** - * Number of block checksum verifications + * Statistics on iterator Seek() (and variants) for each sorted run. + * i.e a single user Seek() can result in many sorted run Seek()s. + * The stats are split between last level and non-last level. + * Filtered: a filter such as prefix Bloom filter indicate the Seek() would + * not find anything relevant, so avoided a likely access to data+index + * blocks. */ - BLOCK_CHECKSUM_COMPUTE_COUNT((byte) -0x2E), - + LAST_LEVEL_SEEK_FILTERED((byte) -0x3B), /** - * # of times cache miss when accessing blob from blob cache. + * Filter match: a filter such as prefix Bloom filter was queried but did + * not filter out the seek. + */ + LAST_LEVEL_SEEK_FILTER_MATCH((byte) -0x3C), + /** + * At least one data block was accessed for a Seek() (or variant) on a + * sorted run. + */ + LAST_LEVEL_SEEK_DATA((byte) -0x3D), + /** + * At least one value() was accessed for the seek (suggesting it was useful), + * and no filter such as prefix Bloom was queried. */ - BLOB_DB_CACHE_MISS((byte) -0x2F), + LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER((byte) -0x3E), + /** + * At least one value() was accessed for the seek (suggesting it was useful), + * after querying a filter such as prefix Bloom. + */ + LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH((byte) -0x3F), /** - * # of times cache hit when accessing blob from blob cache. + * The same set of stats, but for non-last level seeks. */ - BLOB_DB_CACHE_HIT((byte) -0x30), + NON_LAST_LEVEL_SEEK_FILTERED((byte) -0x40), + NON_LAST_LEVEL_SEEK_FILTER_MATCH((byte) -0x41), + NON_LAST_LEVEL_SEEK_DATA((byte) -0x42), + NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER((byte) -0x43), + NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH((byte) -0x44), /** - * # of data blocks added to blob cache. + * Number of block checksum verifications */ - BLOB_DB_CACHE_ADD((byte) -0x31), + BLOCK_CHECKSUM_COMPUTE_COUNT((byte) -0x45), /** - * # # of failures when adding blobs to blob cache. + * Number of times RocksDB detected a corruption while verifying a block + * checksum. RocksDB does not remember corruptions that happened during user + * reads so the same block corruption may be detected multiple times. */ - BLOB_DB_CACHE_ADD_FAILURES((byte) -0x32), + BLOCK_CHECKSUM_MISMATCH_COUNT((byte) -0x46), + + MULTIGET_COROUTINE_COUNT((byte) -0x47), /** - * # of bytes read from blob cache. + * Time spent in the ReadAsync file system call */ - BLOB_DB_CACHE_BYTES_READ((byte) -0x33), + READ_ASYNC_MICROS((byte) -0x48), /** - * # of bytes written into blob cache. + * Number of errors returned to the async read callback */ - BLOB_DB_CACHE_BYTES_WRITE((byte) -0x34), + ASYNC_READ_ERROR_COUNT((byte) -0x49), /** * Number of lookup into the prefetched tail (see * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) * that can't find its data for table open */ - TABLE_OPEN_PREFETCH_TAIL_MISS((byte) -0x3A), + TABLE_OPEN_PREFETCH_TAIL_MISS((byte) -0x4A), /** * Number of lookup into the prefetched tail (see * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) * that finds its data for table open */ - TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x3B), + TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x4B), /** - * Number of times RocksDB detected a corruption while verifying a block - * checksum. RocksDB does not remember corruptions that happened during user - * reads so the same block corruption may be detected multiple times. + * # of times timestamps are checked on accessing the table */ - BLOCK_CHECKSUM_MISMATCH_COUNT((byte) -0x3C), + TIMESTAMP_FILTER_TABLE_CHECKED((byte) -0x4C), + + /** + * # of times timestamps can successfully help skip the table access + */ + TIMESTAMP_FILTER_TABLE_FILTERED((byte) -0x4D), + + READAHEAD_TRIMMED((byte) -0x4E), - READAHEAD_TRIMMED((byte) -0x3D), + FIFO_MAX_SIZE_COMPACTIONS((byte) -0x4F), - FIFO_MAX_SIZE_COMPACTIONS((byte) -0x3E), + FIFO_TTL_COMPACTIONS((byte) -0x50), - FIFO_TTL_COMPACTIONS((byte) -0x3F), + PREFETCH_BYTES((byte) -0x51), - PREFETCH_BYTES((byte) -0x40), + PREFETCH_BYTES_USEFUL((byte) -0x52), - PREFETCH_BYTES_USEFUL((byte) -0x41), + PREFETCH_HITS((byte) -0x53), - PREFETCH_HITS((byte) -0x42), + SST_FOOTER_CORRUPTION_COUNT((byte) -0x55), - TICKER_ENUM_MAX((byte) 0x5F); + TICKER_ENUM_MAX((byte) -0x54); private final byte value; diff --git a/java/src/main/java/org/rocksdb/TimedEnv.java b/java/src/main/java/org/rocksdb/TimedEnv.java index dc8b5d6efb9..55cd3725131 100644 --- a/java/src/main/java/org/rocksdb/TimedEnv.java +++ b/java/src/main/java/org/rocksdb/TimedEnv.java @@ -26,5 +26,9 @@ public TimedEnv(final Env baseEnv) { } private static native long createTimedEnv(final long baseEnvHandle); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java index 8ab968a3c6e..d1ddcbcbe6c 100644 --- a/java/src/main/java/org/rocksdb/Transaction.java +++ b/java/src/main/java/org/rocksdb/Transaction.java @@ -7,6 +7,7 @@ import static org.rocksdb.RocksDB.PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -31,7 +32,10 @@ public class Transaction extends RocksObject { private static final String FOR_EACH_KEY_THERE_MUST_BE_A_COLUMNFAMILYHANDLE = "For each key there must be a ColumnFamilyHandle."; + private static final String BB_ALL_DIRECT_OR_INDIRECT = + "ByteBuffer parameters must all be direct, or must all be indirect"; private final RocksDB parent; + private final ColumnFamilyHandle defaultColumnFamilyHandle; /** * Intentionally package private @@ -47,6 +51,7 @@ public class Transaction extends RocksObject { Transaction(final RocksDB parent, final long transactionHandle) { super(transactionHandle); this.parent = parent; + this.defaultColumnFamilyHandle = parent.getDefaultColumnFamily(); } /** @@ -250,6 +255,9 @@ public void rollbackToSavePoint() throws RocksDBException { } /** + * This function has an inconsistent parameter order compared to other {@code get()} + * methods and is deprecated in favour of one with a consistent order. + * * This function is similar to * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])} except it will * also read pending changes in this transaction. @@ -275,10 +283,44 @@ public void rollbackToSavePoint() throws RocksDBException { * @throws RocksDBException thrown if error happens in underlying native * library. */ - public byte[] get(final ColumnFamilyHandle columnFamilyHandle, - final ReadOptions readOptions, final byte[] key) throws RocksDBException { - assert(isOwningHandle()); - return get(nativeHandle_, readOptions.nativeHandle_, key, key.length, + @Deprecated + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions, + final byte[] key) throws RocksDBException { + assert (isOwningHandle()); + return get(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * This function is similar to + * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])} except it will + * also read pending changes in this transaction. + * Currently, this function will return Status::MergeInProgress if the most + * recent write to the queried key in this batch is a Merge. + * + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. + * + * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect + * what is read from the DB but will NOT change which keys are read from this + * transaction (the keys in this transaction do not yet belong to any snapshot + * and will be fetched regardless). + * + * @param readOptions Read options. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance + * @param key the key to retrieve the value for. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying native + * library. + */ + public byte[] get(final ReadOptions readOptions, final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + assert (isOwningHandle()); + return get(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } @@ -310,7 +352,141 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, public byte[] get(final ReadOptions readOptions, final byte[] key) throws RocksDBException { assert(isOwningHandle()); - return get(nativeHandle_, readOptions.nativeHandle_, key, key.length); + return get(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, + defaultColumnFamilyHandle.nativeHandle_); + } + + /** + * Get the value associated with the specified key in the default column family + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return A {@link GetStatus} wrapping the result status and the return value size. + * If {@code GetStatus.status} is {@code Ok} then {@code GetStatus.requiredSize} contains + * the size of the actual value that matches the specified + * {@code key} in byte. If {@code GetStatus.requiredSize} is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and a partial result was + * returned. If {@code GetStatus.status} is {@code NotFound} this indicates that + * the value was not found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus get(final ReadOptions opt, final byte[] key, final byte[] value) + throws RocksDBException { + final int result = get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, + value.length, defaultColumnFamilyHandle.nativeHandle_); + if (result < 0) { + return GetStatus.fromStatusCode(Status.Code.NotFound, 0); + } else { + return GetStatus.fromStatusCode(Status.Code.Ok, result); + } + } + + /** + * Get the value associated with the specified key in a specified column family + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param columnFamilyHandle the column family to find the key in + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return A {@link GetStatus} wrapping the result status and the return value size. + * If {@code GetStatus.status} is {@code Ok} then {@code GetStatus.requiredSize} contains + * the size of the actual value that matches the specified + * {@code key} in byte. If {@code GetStatus.requiredSize} is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and a partial result was + * returned. If {@code GetStatus.status} is {@code NotFound} this indicates that + * the value was not found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus get(final ReadOptions opt, final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final byte[] value) throws RocksDBException { + final int result = get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, + value.length, columnFamilyHandle.nativeHandle_); + if (result < 0) { + return GetStatus.fromStatusCode(Status.Code.NotFound, 0); + } else { + return GetStatus.fromStatusCode(Status.Code.Ok, result); + } + } + + /** + * Get the value associated with the specified key within the specified column family. + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param columnFamilyHandle the column family in which to find the key. + * @param key the key to retrieve the value. It is using position and limit. + * Supports direct buffer only. + * @param value the out-value to receive the retrieved value. + * It is using position and limit. Limit is set according to value size. + * Supports direct buffer only. + * @return A {@link GetStatus} wrapping the result status and the return value size. + * If {@code GetStatus.status} is {@code Ok} then {@code GetStatus.requiredSize} contains + * the size of the actual value that matches the specified + * {@code key} in byte. If {@code GetStatus.requiredSize} is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and a partial result was + * returned. If {@code GetStatus.status} is {@code NotFound} this indicates that + * the value was not found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus get(final ReadOptions opt, final ColumnFamilyHandle columnFamilyHandle, + final ByteBuffer key, final ByteBuffer value) throws RocksDBException { + final int result; + if (key.isDirect() && value.isDirect()) { + result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), + value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + result = + get(nativeHandle_, opt.nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), + value.remaining(), columnFamilyHandle.nativeHandle_); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + + key.position(key.limit()); + if (result < 0) { + return GetStatus.fromStatusCode(Status.Code.NotFound, 0); + } else { + value.position(Math.min(value.limit(), value.position() + result)); + return GetStatus.fromStatusCode(Status.Code.Ok, result); + } + } + + /** + * Get the value associated with the specified key within the default column family. + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. It is using position and limit. + * Supports direct buffer only. + * @param value the out-value to receive the retrieved value. + * It is using position and limit. Limit is set according to value size. + * Supports direct buffer only. + * @return A {@link GetStatus} wrapping the result status and the return value size. + * If {@code GetStatus.status} is {@code Ok} then {@code GetStatus.requiredSize} contains + * the size of the actual value that matches the specified + * {@code key} in byte. If {@code GetStatus.requiredSize} is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and a partial result was + * returned. If {@code GetStatus.status} is {@code NotFound} this indicates that + * the value was not found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus get(final ReadOptions opt, final ByteBuffer key, final ByteBuffer value) + throws RocksDBException { + return get(opt, this.defaultColumnFamilyHandle, key, value); } /** @@ -536,7 +712,7 @@ public byte[] getForUpdate(final ReadOptions readOptions, final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final boolean exclusive, final boolean doValidate) throws RocksDBException { assert (isOwningHandle()); - return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.length, + return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_, exclusive, doValidate); } @@ -562,7 +738,7 @@ public byte[] getForUpdate(final ReadOptions readOptions, final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final boolean exclusive) throws RocksDBException { assert(isOwningHandle()); - return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.length, + return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_, exclusive, true /*doValidate*/); } @@ -613,8 +789,369 @@ public byte[] getForUpdate(final ReadOptions readOptions, public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key, final boolean exclusive) throws RocksDBException { assert(isOwningHandle()); + return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, + defaultColumnFamilyHandle.nativeHandle_, exclusive, true /*doValidate*/); + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + *

+ * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + *

+ * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + *

+ * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + *

+ * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * @param value the value associated with the input key if + * * any. The result is undefined in no value is associated with the key + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a status object containing + * Status.OK if the requested value was read + * Status.NotFound if the requested value does not exist + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus getForUpdate(final ReadOptions readOptions, final byte[] key, final byte[] value, + final boolean exclusive) throws RocksDBException { + final int result = getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, + value, 0, value.length, defaultColumnFamilyHandle.nativeHandle_, exclusive, + true /* doValidate */); + if (result < 0) { + return GetStatus.fromStatusCode(Status.Code.NotFound, 0); + } else { + return GetStatus.fromStatusCode(Status.Code.Ok, result); + } + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + *

+ * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + *

+ * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + *

+ * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + *

+ * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * @param value the value associated with the input key if + * * any. The result is undefined in no value is associated with the key + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a status object containing + * Status.OK if the requested value was read + * Status.NotFound if the requested value does not exist + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus getForUpdate(final ReadOptions readOptions, final ByteBuffer key, + final ByteBuffer value, final boolean exclusive) throws RocksDBException { + return getForUpdate( + readOptions, defaultColumnFamilyHandle, key, value, exclusive, true /* doValidate */); + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + *

+ * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + *

+ * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + *

+ * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + *

+ * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param columnFamilyHandle in which to find the key/value + * @param key the key to retrieve the value for. + * @param value the value associated with the input key if + * * any. The result is undefined in no value is associated with the key + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a status object containing + * Status.OK if the requested value was read + * Status.NotFound if the requested value does not exist + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value, + final boolean exclusive) throws RocksDBException { return getForUpdate( - nativeHandle_, readOptions.nativeHandle_, key, key.length, exclusive, true /*doValidate*/); + readOptions, columnFamilyHandle, key, value, exclusive, true /*doValidate*/); + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + *

+ * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + *

+ * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + *

+ * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + *

+ * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param columnFamilyHandle in which to find the key/value + * @param key the key to retrieve the value for. + * @param value the value associated with the input key if + * * any. The result is undefined in no value is associated with the key + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * @param doValidate true if the transaction should validate the snapshot before doing the read + * + * @return a status object containing + * Status.OK if the requested value was read + * Status.NotFound if the requested value does not exist + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + + public GetStatus getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value, + final boolean exclusive, final boolean doValidate) throws RocksDBException { + final int result = getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, + value, 0, value.length, columnFamilyHandle.nativeHandle_, exclusive, doValidate); + if (result < 0) { + return GetStatus.fromStatusCode(Status.Code.NotFound, 0); + } else { + return GetStatus.fromStatusCode(Status.Code.Ok, result); + } + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + *

+ * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + *

+ * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + *

+ * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + *

+ * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param columnFamilyHandle in which to find the key/value + * @param key the key to retrieve the value for. + * @param value the value associated with the input key if + * * any. The result is undefined in no value is associated with the key + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a status object containing + * Status.OK if the requested value was read + * Status.NotFound if the requested value does not exist + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + + public GetStatus getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value, + final boolean exclusive) throws RocksDBException { + return getForUpdate( + readOptions, columnFamilyHandle, key, value, exclusive, true /*doValidate*/); + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + *

+ * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + *

+ * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + *

+ * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + *

+ * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param columnFamilyHandle in which to find the key/value + * @param key the key to retrieve the value for. + * @param value the value associated with the input key if + * * any. The result is undefined in no value is associated with the key + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * @param doValidate true if the transaction should validate the snapshot before doing the read + * + * @return a status object containing + * Status.OK if the requested value was read + * Status.NotFound if the requested value does not exist + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public GetStatus getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value, + final boolean exclusive, final boolean doValidate) throws RocksDBException { + final int result; + if (key.isDirect() && value.isDirect()) { + result = getDirectForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.position(), + key.remaining(), value, value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_, exclusive, doValidate); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + result = getForUpdate(nativeHandle_, readOptions.nativeHandle_, key.array(), + key.arrayOffset() + key.position(), key.remaining(), value.array(), + value.arrayOffset() + value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_, exclusive, doValidate); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + if (result < 0) { + return GetStatus.fromStatusCode(Status.Code.NotFound, 0); + } else { + value.position(Math.min(value.limit(), value.position() + result)); + return GetStatus.fromStatusCode(Status.Code.Ok, result); + } } /** @@ -743,6 +1280,27 @@ public List multiGetForUpdateAsList( * column family including both keys in the DB and uncommitted keys in this * transaction. *

+ * Caller is responsible for deleting the returned Iterator. + *

+ * The returned iterator is only valid until {@link #commit()}, + * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called. + * + * @return instance of iterator object. + */ + public RocksIterator getIterator() { + assert (isOwningHandle()); + try (ReadOptions readOptions = new ReadOptions()) { + return new RocksIterator(parent, + getIterator( + nativeHandle_, readOptions.nativeHandle_, defaultColumnFamilyHandle.nativeHandle_)); + } + } + + /** + * Returns an iterator that will iterate on all keys in the default + * column family including both keys in the DB and uncommitted keys in this + * transaction. + * * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction * (the keys in this transaction do not yet belong to any snapshot and will be @@ -759,8 +1317,9 @@ public List multiGetForUpdateAsList( */ public RocksIterator getIterator(final ReadOptions readOptions) { assert(isOwningHandle()); - return new RocksIterator(parent, getIterator(nativeHandle_, - readOptions.nativeHandle_)); + return new RocksIterator(parent, + getIterator( + nativeHandle_, readOptions.nativeHandle_, defaultColumnFamilyHandle.nativeHandle_)); } /** @@ -792,6 +1351,35 @@ public RocksIterator getIterator(final ReadOptions readOptions, readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_)); } + /** + * Returns an iterator that will iterate on all keys in the column family + * specified by {@code columnFamilyHandle} including both keys in the DB + * and uncommitted keys in this transaction. + *

+ * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read + * from the DB but will NOT change which keys are read from this transaction + * (the keys in this transaction do not yet belong to any snapshot and will be + * fetched regardless). + *

+ * Caller is responsible for calling {@link RocksIterator#close()} on + * the returned Iterator. + *

+ * The returned iterator is only valid until {@link #commit()}, + * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * + * @return instance of iterator object. + */ + public RocksIterator getIterator(final ColumnFamilyHandle columnFamilyHandle) { + assert (isOwningHandle()); + try (ReadOptions readOptions = new ReadOptions()) { + return new RocksIterator(parent, + getIterator(nativeHandle_, readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_)); + } + } + /** * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, but * will also perform conflict checking on the keys be written. @@ -823,8 +1411,8 @@ public RocksIterator getIterator(final ReadOptions readOptions, public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value, final boolean assumeTracked) throws RocksDBException { assert (isOwningHandle()); - put(nativeHandle_, key, key.length, value, value.length, - columnFamilyHandle.nativeHandle_, assumeTracked); + put(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_, + assumeTracked); } /** @@ -855,8 +1443,8 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { assert(isOwningHandle()); - put(nativeHandle_, key, key.length, value, value.length, - columnFamilyHandle.nativeHandle_, false); + put(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_, + false); } /** @@ -884,7 +1472,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, public void put(final byte[] key, final byte[] value) throws RocksDBException { assert(isOwningHandle()); - put(nativeHandle_, key, key.length, value, value.length); + put(nativeHandle_, key, 0, key.length, value, 0, value.length); } //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future @@ -935,7 +1523,97 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_, false); } - //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link RocksDB#put(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException { + assert (isOwningHandle()); + if (key.isDirect() && value.isDirect()) { + putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), + value.remaining()); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + put(nativeHandle_, key.array(), key.arrayOffset() + key.position(), key.remaining(), + value.array(), value.arrayOffset() + value.position(), value.remaining()); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + value.position(value.limit()); + } + + /** + * Similar to {@link RocksDB#put(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param columnFamilyHandle The column family to put the key/value into + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * @param assumeTracked true when it is expected that the key is already + * tracked. More specifically, it means the the key was previous tracked + * in the same savepoint, with the same exclusive flag, and at a lower + * sequence number. If valid then it skips ValidateSnapshot, + * throws an error otherwise. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void put(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value, final boolean assumeTracked) throws RocksDBException { + assert (isOwningHandle()); + if (key.isDirect() && value.isDirect()) { + putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), + value.remaining(), columnFamilyHandle.nativeHandle_, assumeTracked); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + put(nativeHandle_, key.array(), key.arrayOffset() + key.position(), key.remaining(), + value.array(), value.arrayOffset() + value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_, assumeTracked); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + value.position(value.limit()); + } + public void put(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value) throws RocksDBException { + put(columnFamilyHandle, key, value, false); + } + + // TODO(AR) refactor if we implement org.rocksdb.SliceParts in future /** * Similar to {@link #put(byte[], byte[])} but allows * you to specify the key and value in several parts that will be @@ -986,7 +1664,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value, final boolean assumeTracked) throws RocksDBException { assert (isOwningHandle()); - merge(nativeHandle_, key, key.length, value, value.length, + merge(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_, assumeTracked); } @@ -1018,7 +1696,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { assert(isOwningHandle()); - merge(nativeHandle_, key, key.length, value, value.length, + merge(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_, false); } @@ -1047,7 +1725,115 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, public void merge(final byte[] key, final byte[] value) throws RocksDBException { assert(isOwningHandle()); - merge(nativeHandle_, key, key.length, value, value.length); + merge(nativeHandle_, key, 0, key.length, value, 0, value.length); + } + + /** + * Similar to {@link RocksDB#merge(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void merge(final ByteBuffer key, final ByteBuffer value) throws RocksDBException { + assert (isOwningHandle()); + if (key.isDirect() && value.isDirect()) { + mergeDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), + value.remaining()); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + merge(nativeHandle_, key.array(), key.arrayOffset() + key.position(), key.remaining(), + value.array(), value.arrayOffset() + value.position(), value.remaining()); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + } + + /** + * Similar to {@link RocksDB#merge(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param columnFamilyHandle in which to apply the merge + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * @param assumeTracked expects the key be already tracked. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void merge(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value, final boolean assumeTracked) throws RocksDBException { + assert (isOwningHandle()); + if (key.isDirect() && value.isDirect()) { + mergeDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), + value.remaining(), columnFamilyHandle.nativeHandle_, assumeTracked); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + merge(nativeHandle_, key.array(), key.arrayOffset() + key.position(), key.remaining(), + value.array(), value.arrayOffset() + value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_, assumeTracked); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + value.position(value.limit()); + } + + /** + * Similar to {@link RocksDB#merge(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param columnFamilyHandle in which to apply the merge + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void merge(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value) throws RocksDBException { + merge(columnFamilyHandle, key, value, false); } /** @@ -1473,10 +2259,50 @@ public void putUntracked(final byte[][] keyParts, final byte[][] valueParts) */ public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { - mergeUntracked(nativeHandle_, key, key.length, value, value.length, + assert (isOwningHandle()); + mergeUntracked(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } + /** + * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])}, + * but operates on the transactions write batch. This write will only happen + * if this transaction gets committed successfully. + * + * Unlike {@link #merge(ColumnFamilyHandle, byte[], byte[])} no conflict + * checking will be performed for this key. + * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param columnFamilyHandle The column family to merge the key/value into + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value) throws RocksDBException { + assert (isOwningHandle()); + if (key.isDirect() && value.isDirect()) { + mergeUntrackedDirect(nativeHandle_, key, key.position(), key.remaining(), value, + value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); + } else if (!key.isDirect() && !value.isDirect()) { + assert key.hasArray(); + assert value.hasArray(); + mergeUntracked(nativeHandle_, key.array(), key.arrayOffset() + key.position(), + key.remaining(), value.array(), value.arrayOffset() + value.position(), value.remaining(), + columnFamilyHandle.nativeHandle_); + } else { + throw new RocksDBException(BB_ALL_DIRECT_OR_INDIRECT); + } + key.position(key.limit()); + value.position(value.limit()); + } + /** * Similar to {@link RocksDB#merge(byte[], byte[])}, * but operates on the transactions write batch. This write will only happen @@ -1498,8 +2324,30 @@ public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle, */ public void mergeUntracked(final byte[] key, final byte[] value) throws RocksDBException { - assert(isOwningHandle()); - mergeUntracked(nativeHandle_, key, key.length, value, value.length); + mergeUntracked(defaultColumnFamilyHandle, key, value); + } + + /** + * Similar to {@link RocksDB#merge(byte[], byte[])}, + * but operates on the transactions write batch. This write will only happen + * if this transaction gets committed successfully. + * + * Unlike {@link #merge(byte[], byte[])} no conflict + * checking will be performed for this key. + * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void mergeUntracked(final ByteBuffer key, final ByteBuffer value) throws RocksDBException { + mergeUntracked(defaultColumnFamilyHandle, key, value); } /** @@ -2028,137 +2876,147 @@ public long[] getTransactionIds() { } } - private native void setSnapshot(final long handle); - private native void setSnapshotOnNextOperation(final long handle); - private native void setSnapshotOnNextOperation(final long handle, - final long transactionNotifierHandle); - private native long getSnapshot(final long handle); - private native void clearSnapshot(final long handle); - private native void prepare(final long handle) throws RocksDBException; - private native void commit(final long handle) throws RocksDBException; - private native void rollback(final long handle) throws RocksDBException; - private native void setSavePoint(final long handle) throws RocksDBException; - private native void rollbackToSavePoint(final long handle) - throws RocksDBException; - private native byte[] get(final long handle, final long readOptionsHandle, final byte[] key, - final int keyLength, final long columnFamilyHandle) throws RocksDBException; - private native byte[] get(final long handle, final long readOptionsHandle, final byte[] key, - final int keyLen) throws RocksDBException; - private native byte[][] multiGet(final long handle, - final long readOptionsHandle, final byte[][] keys, - final long[] columnFamilyHandles) throws RocksDBException; - private native byte[][] multiGet(final long handle, - final long readOptionsHandle, final byte[][] keys) + private static native void setSnapshot(final long handle); + private static native void setSnapshotOnNextOperation(final long handle); + private static native void setSnapshotOnNextOperation( + final long handle, final long transactionNotifierHandle); + private static native long getSnapshot(final long handle); + private static native void clearSnapshot(final long handle); + private static native void prepare(final long handle) throws RocksDBException; + private static native void commit(final long handle) throws RocksDBException; + private static native void rollback(final long handle) throws RocksDBException; + private static native void setSavePoint(final long handle) throws RocksDBException; + private static native void rollbackToSavePoint(final long handle) throws RocksDBException; + private static native byte[] get(final long handle, final long readOptionsHandle, + final byte[] key, final int keyOffset, final int keyLength, final long columnFamilyHandle) throws RocksDBException; - private native byte[] getForUpdate(final long handle, final long readOptionsHandle, - final byte[] key, final int keyLength, final long columnFamilyHandle, final boolean exclusive, - final boolean doValidate) throws RocksDBException; - private native byte[] getForUpdate(final long handle, final long readOptionsHandle, - final byte[] key, final int keyLen, final boolean exclusive, final boolean doValidate) + private static native int get(final long handle, final long readOptionsHandle, final byte[] key, + final int keyOffset, final int keyLen, final byte[] value, final int valueOffset, + final int valueLen, final long columnFamilyHandle) throws RocksDBException; + private static native int getDirect(final long handle, final long readOptionsHandle, + final ByteBuffer key, final int keyOffset, final int keyLength, final ByteBuffer value, + final int valueOffset, final int valueLength, final long columnFamilyHandle) throws RocksDBException; - private native byte[][] multiGetForUpdate(final long handle, - final long readOptionsHandle, final byte[][] keys, - final long[] columnFamilyHandles) throws RocksDBException; - private native byte[][] multiGetForUpdate(final long handle, - final long readOptionsHandle, final byte[][] keys) - throws RocksDBException; - private native long getIterator(final long handle, - final long readOptionsHandle); - private native long getIterator(final long handle, - final long readOptionsHandle, final long columnFamilyHandle); - private native void put(final long handle, final byte[] key, final int keyLength, - final byte[] value, final int valueLength, final long columnFamilyHandle, - final boolean assumeTracked) throws RocksDBException; - private native void put(final long handle, final byte[] key, - final int keyLength, final byte[] value, final int valueLength) + + private static native byte[][] multiGet(final long handle, final long readOptionsHandle, + final byte[][] keys, final long[] columnFamilyHandles) throws RocksDBException; + private static native byte[][] multiGet( + final long handle, final long readOptionsHandle, final byte[][] keys) throws RocksDBException; + private static native byte[] getForUpdate(final long handle, final long readOptionsHandle, + final byte[] key, final int keyOffset, final int keyLength, final long columnFamilyHandle, + final boolean exclusive, final boolean doValidate) throws RocksDBException; + private static native int getForUpdate(final long handle, final long readOptionsHandle, + final byte[] key, final int keyOffset, final int keyLength, final byte[] value, + final int valueOffset, final int valueLen, final long columnFamilyHandle, + final boolean exclusive, final boolean doValidate) throws RocksDBException; + private static native int getDirectForUpdate(final long handle, final long readOptionsHandle, + final ByteBuffer key, final int keyOffset, final int keyLength, final ByteBuffer value, + final int valueOffset, final int valueLen, final long columnFamilyHandle, + final boolean exclusive, final boolean doValidate) throws RocksDBException; + private static native byte[][] multiGetForUpdate(final long handle, final long readOptionsHandle, + final byte[][] keys, final long[] columnFamilyHandles) throws RocksDBException; + private static native byte[][] multiGetForUpdate( + final long handle, final long readOptionsHandle, final byte[][] keys) throws RocksDBException; + private static native long getIterator( + final long handle, final long readOptionsHandle, final long columnFamilyHandle); + private static native void put(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; - private native void put(final long handle, final byte[][] keys, final int keysLength, + private static native void put(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength, + final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; + private static native void put(final long handle, final byte[][] keys, final int keysLength, final byte[][] values, final int valuesLength, final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; - private native void put(final long handle, final byte[][] keys, - final int keysLength, final byte[][] values, final int valuesLength) - throws RocksDBException; - private native void merge(final long handle, final byte[] key, final int keyLength, - final byte[] value, final int valueLength, final long columnFamilyHandle, + private static native void put(final long handle, final byte[][] keys, final int keysLength, + final byte[][] values, final int valuesLength) throws RocksDBException; + private static native void putDirect(long handle, ByteBuffer key, int keyOffset, int keyLength, + ByteBuffer value, int valueOffset, int valueLength, long cfHandle, final boolean assumeTracked) throws RocksDBException; - private native void merge(final long handle, final byte[] key, - final int keyLength, final byte[] value, final int valueLength) - throws RocksDBException; - private native void delete(final long handle, final byte[] key, final int keyLength, + private static native void putDirect(long handle, ByteBuffer key, int keyOffset, int keyLength, + ByteBuffer value, int valueOffset, int valueLength) throws RocksDBException; + + private static native void merge(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; - private native void delete(final long handle, final byte[] key, - final int keyLength) throws RocksDBException; - private native void delete(final long handle, final byte[][] keys, final int keysLength, + private static native void mergeDirect(long handle, ByteBuffer key, int keyOffset, int keyLength, + ByteBuffer value, int valueOffset, int valueLength, long cfHandle, boolean assumeTracked) + throws RocksDBException; + private static native void mergeDirect(long handle, ByteBuffer key, int keyOffset, int keyLength, + ByteBuffer value, int valueOffset, int valueLength) throws RocksDBException; + + private static native void merge(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, final int valueLength) + throws RocksDBException; + private static native void delete(final long handle, final byte[] key, final int keyLength, final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; - private native void delete(final long handle, final byte[][] keys, - final int keysLength) throws RocksDBException; - private native void singleDelete(final long handle, final byte[] key, final int keyLength, + private static native void delete(final long handle, final byte[] key, final int keyLength) + throws RocksDBException; + private static native void delete(final long handle, final byte[][] keys, final int keysLength, final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; - private native void singleDelete(final long handle, final byte[] key, - final int keyLength) throws RocksDBException; - private native void singleDelete(final long handle, final byte[][] keys, final int keysLength, + private static native void delete(final long handle, final byte[][] keys, final int keysLength) + throws RocksDBException; + private static native void singleDelete(final long handle, final byte[] key, final int keyLength, final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; - private native void singleDelete(final long handle, final byte[][] keys, - final int keysLength) throws RocksDBException; - private native void putUntracked(final long handle, final byte[] key, - final int keyLength, final byte[] value, final int valueLength, - final long columnFamilyHandle) throws RocksDBException; - private native void putUntracked(final long handle, final byte[] key, - final int keyLength, final byte[] value, final int valueLength) + private static native void singleDelete(final long handle, final byte[] key, final int keyLength) + throws RocksDBException; + private static native void singleDelete(final long handle, final byte[][] keys, + final int keysLength, final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException; - private native void putUntracked(final long handle, final byte[][] keys, + private static native void singleDelete( + final long handle, final byte[][] keys, final int keysLength) throws RocksDBException; + private static native void putUntracked(final long handle, final byte[] key, final int keyLength, + final byte[] value, final int valueLength, final long columnFamilyHandle) + throws RocksDBException; + private static native void putUntracked(final long handle, final byte[] key, final int keyLength, + final byte[] value, final int valueLength) throws RocksDBException; + private static native void putUntracked(final long handle, final byte[][] keys, final int keysLength, final byte[][] values, final int valuesLength, final long columnFamilyHandle) throws RocksDBException; - private native void putUntracked(final long handle, final byte[][] keys, - final int keysLength, final byte[][] values, final int valuesLength) - throws RocksDBException; - private native void mergeUntracked(final long handle, final byte[] key, - final int keyLength, final byte[] value, final int valueLength, + private static native void putUntracked(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength) throws RocksDBException; + private static native void mergeUntracked(final long handle, final byte[] key, final int keyOff, + final int keyLength, final byte[] value, final int valueOff, final int valueLength, final long columnFamilyHandle) throws RocksDBException; - private native void mergeUntracked(final long handle, final byte[] key, - final int keyLength, final byte[] value, final int valueLength) - throws RocksDBException; - private native void deleteUntracked(final long handle, final byte[] key, - final int keyLength, final long columnFamilyHandle) - throws RocksDBException; - private native void deleteUntracked(final long handle, final byte[] key, - final int keyLength) throws RocksDBException; - private native void deleteUntracked(final long handle, final byte[][] keys, - final int keysLength, final long columnFamilyHandle) - throws RocksDBException; - private native void deleteUntracked(final long handle, final byte[][] keys, - final int keysLength) throws RocksDBException; - private native void putLogData(final long handle, final byte[] blob, - final int blobLength); - private native void disableIndexing(final long handle); - private native void enableIndexing(final long handle); - private native long getNumKeys(final long handle); - private native long getNumPuts(final long handle); - private native long getNumDeletes(final long handle); - private native long getNumMerges(final long handle); - private native long getElapsedTime(final long handle); - private native long getWriteBatch(final long handle); - private native void setLockTimeout(final long handle, final long lockTimeout); - private native long getWriteOptions(final long handle); - private native void setWriteOptions(final long handle, - final long writeOptionsHandle); - private native void undoGetForUpdate(final long handle, final byte[] key, - final int keyLength, final long columnFamilyHandle); - private native void undoGetForUpdate(final long handle, final byte[] key, - final int keyLength); - private native void rebuildFromWriteBatch(final long handle, - final long writeBatchHandle) throws RocksDBException; - private native long getCommitTimeWriteBatch(final long handle); - private native void setLogNumber(final long handle, final long logNumber); - private native long getLogNumber(final long handle); - private native void setName(final long handle, final String name) + private static native void mergeUntrackedDirect(final long handle, final ByteBuffer key, + final int keyOff, final int keyLength, final ByteBuffer value, final int valueOff, + final int valueLength, final long columnFamilyHandle) throws RocksDBException; + private static native void deleteUntracked(final long handle, final byte[] key, + final int keyLength, final long columnFamilyHandle) throws RocksDBException; + private static native void deleteUntracked( + final long handle, final byte[] key, final int keyLength) throws RocksDBException; + private static native void deleteUntracked(final long handle, final byte[][] keys, + final int keysLength, final long columnFamilyHandle) throws RocksDBException; + private static native void deleteUntracked( + final long handle, final byte[][] keys, final int keysLength) throws RocksDBException; + private static native void putLogData(final long handle, final byte[] blob, final int blobLength); + private static native void disableIndexing(final long handle); + private static native void enableIndexing(final long handle); + private static native long getNumKeys(final long handle); + private static native long getNumPuts(final long handle); + private static native long getNumDeletes(final long handle); + private static native long getNumMerges(final long handle); + private static native long getElapsedTime(final long handle); + private static native long getWriteBatch(final long handle); + private static native void setLockTimeout(final long handle, final long lockTimeout); + private static native long getWriteOptions(final long handle); + private static native void setWriteOptions(final long handle, final long writeOptionsHandle); + private static native void undoGetForUpdate( + final long handle, final byte[] key, final int keyLength, final long columnFamilyHandle); + private static native void undoGetForUpdate( + final long handle, final byte[] key, final int keyLength); + private static native void rebuildFromWriteBatch(final long handle, final long writeBatchHandle) throws RocksDBException; - private native String getName(final long handle); - private native long getID(final long handle); - private native boolean isDeadlockDetect(final long handle); + private static native long getCommitTimeWriteBatch(final long handle); + private static native void setLogNumber(final long handle, final long logNumber); + private static native long getLogNumber(final long handle); + private static native void setName(final long handle, final String name) throws RocksDBException; + private static native String getName(final long handle); + private static native long getID(final long handle); + private static native boolean isDeadlockDetect(final long handle); private native WaitingTransactions getWaitingTxns(final long handle); - private native byte getState(final long handle); - private native long getId(final long handle); + private static native byte getState(final long handle); + private static native long getId(final long handle); @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/TransactionDB.java b/java/src/main/java/org/rocksdb/TransactionDB.java index 134a0c8a13e..0f75e5f9701 100644 --- a/java/src/main/java/org/rocksdb/TransactionDB.java +++ b/java/src/main/java/org/rocksdb/TransactionDB.java @@ -6,6 +6,7 @@ package org.rocksdb; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -50,6 +51,7 @@ public static TransactionDB open(final Options options, // the currently-created RocksDB. tdb.storeOptionsInstance(options); tdb.storeTransactionDbOptions(transactionDbOptions); + tdb.storeDefaultColumnFamilyHandle(tdb.makeDefaultColumnFamilyHandle()); return tdb; } @@ -76,7 +78,7 @@ public static TransactionDB open(final DBOptions dbOptions, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { - + int defaultColumnFamilyIndex = -1; final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { @@ -84,6 +86,13 @@ public static TransactionDB open(final DBOptions dbOptions, .get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + if (Arrays.equals(cfDescriptor.getName(), RocksDB.DEFAULT_COLUMN_FAMILY)) { + defaultColumnFamilyIndex = i; + } + } + if (defaultColumnFamilyIndex < 0) { + throw new IllegalArgumentException( + "You must provide the default column family in your columnFamilyDescriptors"); } final long[] handles = open(dbOptions.nativeHandle_, @@ -99,6 +108,8 @@ public static TransactionDB open(final DBOptions dbOptions, for (int i = 1; i < handles.length; i++) { columnFamilyHandles.add(new ColumnFamilyHandle(tdb, handles[i])); } + tdb.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + tdb.storeDefaultColumnFamilyHandle(columnFamilyHandles.get(defaultColumnFamilyIndex)); return tdb; } @@ -141,6 +152,12 @@ public void closeE() throws RocksDBException { @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : // NOPMD - CloseResource + ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -377,7 +394,11 @@ private void storeTransactionDbOptions( this.transactionDbOptions_ = transactionDbOptions; } - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native long open(final long optionsHandle, final long transactionDbOptionsHandle, final String path) @@ -386,21 +407,17 @@ private static native long[] open(final long dbOptionsHandle, final long transactionDbOptionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions); private static native void closeDatabase(final long handle) throws RocksDBException; - private native long beginTransaction(final long handle, - final long writeOptionsHandle); - private native long beginTransaction(final long handle, - final long writeOptionsHandle, final long transactionOptionsHandle); - private native long beginTransaction_withOld(final long handle, - final long writeOptionsHandle, final long oldTransactionHandle); - private native long beginTransaction_withOld(final long handle, + private static native long beginTransaction(final long handle, final long writeOptionsHandle); + private static native long beginTransaction( + final long handle, final long writeOptionsHandle, final long transactionOptionsHandle); + private static native long beginTransaction_withOld( + final long handle, final long writeOptionsHandle, final long oldTransactionHandle); + private static native long beginTransaction_withOld(final long handle, final long writeOptionsHandle, final long transactionOptionsHandle, final long oldTransactionHandle); - private native long getTransactionByName(final long handle, - final String name); - private native long[] getAllPreparedTransactions(final long handle); - private native Map getLockStatusData( - final long handle); - private native DeadlockPath[] getDeadlockInfoBuffer(final long handle); - private native void setDeadlockInfoBufferSize(final long handle, - final int targetSize); + private static native long getTransactionByName(final long handle, final String name); + private static native long[] getAllPreparedTransactions(final long handle); + private static native Map getLockStatusData(final long handle); + private static native DeadlockPath[] getDeadlockInfoBuffer(final long handle); + private static native void setDeadlockInfoBufferSize(final long handle, final int targetSize); } diff --git a/java/src/main/java/org/rocksdb/TransactionDBOptions.java b/java/src/main/java/org/rocksdb/TransactionDBOptions.java index 391025d6ae9..8257d50f7a6 100644 --- a/java/src/main/java/org/rocksdb/TransactionDBOptions.java +++ b/java/src/main/java/org/rocksdb/TransactionDBOptions.java @@ -199,18 +199,21 @@ public TransactionDBOptions setWritePolicy( } private static native long newTransactionDBOptions(); - private native long getMaxNumLocks(final long handle); - private native void setMaxNumLocks(final long handle, - final long maxNumLocks); - private native long getNumStripes(final long handle); - private native void setNumStripes(final long handle, final long numStripes); - private native long getTransactionLockTimeout(final long handle); - private native void setTransactionLockTimeout(final long handle, - final long transactionLockTimeout); - private native long getDefaultLockTimeout(final long handle); - private native void setDefaultLockTimeout(final long handle, - final long transactionLockTimeout); - private native byte getWritePolicy(final long handle); - private native void setWritePolicy(final long handle, final byte writePolicy); - @Override protected final native void disposeInternal(final long handle); + private static native long getMaxNumLocks(final long handle); + private static native void setMaxNumLocks(final long handle, final long maxNumLocks); + private static native long getNumStripes(final long handle); + private static native void setNumStripes(final long handle, final long numStripes); + private static native long getTransactionLockTimeout(final long handle); + private static native void setTransactionLockTimeout( + final long handle, final long transactionLockTimeout); + private static native long getDefaultLockTimeout(final long handle); + private static native void setDefaultLockTimeout( + final long handle, final long transactionLockTimeout); + private static native byte getWritePolicy(final long handle); + private static native void setWritePolicy(final long handle, final byte writePolicy); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/TransactionLogIterator.java b/java/src/main/java/org/rocksdb/TransactionLogIterator.java index 5d9ec58d77f..c32241bcd66 100644 --- a/java/src/main/java/org/rocksdb/TransactionLogIterator.java +++ b/java/src/main/java/org/rocksdb/TransactionLogIterator.java @@ -103,10 +103,13 @@ public WriteBatch writeBatch() { private final WriteBatch writeBatch_; } - @Override protected final native void disposeInternal(final long handle); - private native boolean isValid(long handle); - private native void next(long handle); - private native void status(long handle) - throws RocksDBException; - private native BatchResult getBatch(long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + private static native boolean isValid(long handle); + private static native void next(long handle); + private static native void status(long handle) throws RocksDBException; + private static native BatchResult getBatch(long handle); } diff --git a/java/src/main/java/org/rocksdb/TransactionOptions.java b/java/src/main/java/org/rocksdb/TransactionOptions.java index f93d3cb3cbb..d2efeb87ce4 100644 --- a/java/src/main/java/org/rocksdb/TransactionOptions.java +++ b/java/src/main/java/org/rocksdb/TransactionOptions.java @@ -169,21 +169,22 @@ public TransactionOptions setMaxWriteBatchSize(final long maxWriteBatchSize) { } private static native long newTransactionOptions(); - private native boolean isSetSnapshot(final long handle); - private native void setSetSnapshot(final long handle, - final boolean setSnapshot); - private native boolean isDeadlockDetect(final long handle); - private native void setDeadlockDetect(final long handle, - final boolean deadlockDetect); - private native long getLockTimeout(final long handle); - private native void setLockTimeout(final long handle, final long lockTimeout); - private native long getExpiration(final long handle); - private native void setExpiration(final long handle, final long expiration); - private native long getDeadlockDetectDepth(final long handle); - private native void setDeadlockDetectDepth(final long handle, - final long deadlockDetectDepth); - private native long getMaxWriteBatchSize(final long handle); - private native void setMaxWriteBatchSize(final long handle, - final long maxWriteBatchSize); - @Override protected final native void disposeInternal(final long handle); + private static native boolean isSetSnapshot(final long handle); + private static native void setSetSnapshot(final long handle, final boolean setSnapshot); + private static native boolean isDeadlockDetect(final long handle); + private static native void setDeadlockDetect(final long handle, final boolean deadlockDetect); + private static native long getLockTimeout(final long handle); + private static native void setLockTimeout(final long handle, final long lockTimeout); + private static native long getExpiration(final long handle); + private static native void setExpiration(final long handle, final long expiration); + private static native long getDeadlockDetectDepth(final long handle); + private static native void setDeadlockDetectDepth( + final long handle, final long deadlockDetectDepth); + private static native long getMaxWriteBatchSize(final long handle); + private static native void setMaxWriteBatchSize(final long handle, final long maxWriteBatchSize); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java index 9a90ba3586b..fa8e03598ef 100644 --- a/java/src/main/java/org/rocksdb/TtlDB.java +++ b/java/src/main/java/org/rocksdb/TtlDB.java @@ -5,6 +5,7 @@ package org.rocksdb; +import java.util.Arrays; import java.util.List; /** @@ -84,7 +85,10 @@ public static TtlDB open(final Options options, final String db_path) */ public static TtlDB open(final Options options, final String db_path, final int ttl, final boolean readOnly) throws RocksDBException { - return new TtlDB(open(options.nativeHandle_, db_path, ttl, readOnly)); + final TtlDB db = new TtlDB(open(options.nativeHandle_, db_path, ttl, readOnly)); + db.storeOptionsInstance(options); + db.storeDefaultColumnFamilyHandle(db.makeDefaultColumnFamilyHandle()); + return db; } /** @@ -116,6 +120,7 @@ public static TtlDB open(final DBOptions options, final String db_path, + " family handle."); } + int defaultColumnFamilyIndex = -1; final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { @@ -123,6 +128,13 @@ public static TtlDB open(final DBOptions options, final String db_path, columnFamilyDescriptors.get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + if (Arrays.equals(cfDescriptor.getName(), RocksDB.DEFAULT_COLUMN_FAMILY)) { + defaultColumnFamilyIndex = i; + } + } + if (defaultColumnFamilyIndex < 0) { + throw new IllegalArgumentException( + "You must provide the default column family in your columnFamilyDescriptors"); } final int[] ttlVals = new int[ttlValues.size()]; @@ -136,6 +148,10 @@ public static TtlDB open(final DBOptions options, final String db_path, for (int i = 1; i < handles.length; i++) { columnFamilyHandles.add(new ColumnFamilyHandle(ttlDB, handles[i])); } + ttlDB.storeOptionsInstance(options); + ttlDB.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + ttlDB.storeDefaultColumnFamilyHandle(columnFamilyHandles.get(defaultColumnFamilyIndex)); + return ttlDB; } @@ -179,6 +195,12 @@ public void closeE() throws RocksDBException { @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : // NOPMD - CloseResource + ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -230,15 +252,19 @@ protected TtlDB(final long nativeHandle) { super(nativeHandle); } - @Override protected native void disposeInternal(final long handle); + @Override + protected void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); private static native long open(final long optionsHandle, final String db_path, final int ttl, final boolean readOnly) throws RocksDBException; private static native long[] openCF(final long optionsHandle, final String db_path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions, final int[] ttlValues, final boolean readOnly) throws RocksDBException; - private native long createColumnFamilyWithTtl(final long handle, - final byte[] columnFamilyName, final long columnFamilyOptions, int ttl) + private static native long createColumnFamilyWithTtl( + final long handle, final byte[] columnFamilyName, final long columnFamilyOptions, int ttl) throws RocksDBException; private static native void closeDatabase(final long handle) throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/UInt64AddOperator.java b/java/src/main/java/org/rocksdb/UInt64AddOperator.java index 0cffdce8c11..536ba58d835 100644 --- a/java/src/main/java/org/rocksdb/UInt64AddOperator.java +++ b/java/src/main/java/org/rocksdb/UInt64AddOperator.java @@ -15,5 +15,9 @@ public UInt64AddOperator() { } private static native long newSharedUInt64AddOperator(); - @Override protected final native void disposeInternal(final long handle); + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); } diff --git a/java/src/main/java/org/rocksdb/VectorMemTableConfig.java b/java/src/main/java/org/rocksdb/VectorMemTableConfig.java index fb1e7a94854..d87efb1b7fb 100644 --- a/java/src/main/java/org/rocksdb/VectorMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/VectorMemTableConfig.java @@ -40,7 +40,7 @@ public int reservedSize() { return newMemTableFactoryHandle(reservedSize_); } - private native long newMemTableFactoryHandle(long reservedSize) + private static native long newMemTableFactoryHandle(long reservedSize) throws IllegalArgumentException; private int reservedSize_; } diff --git a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java index 25d6e6f9d66..5f7b7b8a1d7 100644 --- a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java +++ b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -40,30 +40,87 @@ public WriteEntry entry() { return entry; } - @Override protected final native void disposeInternal(final long handle); - @Override final native boolean isValid0(long handle); - @Override final native void seekToFirst0(long handle); - @Override final native void seekToLast0(long handle); - @Override final native void next0(long handle); - @Override final native void prev0(long handle); - @Override final native void refresh0(final long handle) throws RocksDBException; - @Override final native void seek0(long handle, byte[] target, int targetLen); - @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); - @Override final native void status0(long handle) throws RocksDBException; + @Override final native void refresh1(long handle, long snapshotHandle); @Override - final native void seekDirect0( + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + @Override + final boolean isValid0(long handle) { + return isValid0Jni(handle); + } + private static native boolean isValid0Jni(long handle); + @Override + final void seekToFirst0(long handle) { + seekToFirst0Jni(handle); + } + private static native void seekToFirst0Jni(long handle); + @Override + final void seekToLast0(long handle) { + seekToLast0Jni(handle); + } + private static native void seekToLast0Jni(long handle); + @Override + final void next0(long handle) { + next0Jni(handle); + } + private static native void next0Jni(long handle); + @Override + final void prev0(long handle) { + prev0Jni(handle); + } + private static native void prev0Jni(long handle); + @Override + final void refresh0(final long handle) throws RocksDBException { + refresh0Jni(handle); + } + private static native void refresh0Jni(final long handle) throws RocksDBException; + @Override + final void seek0(long handle, byte[] target, int targetLen) { + seek0Jni(handle, target, targetLen); + } + private static native void seek0Jni(long handle, byte[] target, int targetLen); + @Override + final void seekForPrev0(long handle, byte[] target, int targetLen) { + seekForPrev0Jni(handle, target, targetLen); + } + private static native void seekForPrev0Jni(long handle, byte[] target, int targetLen); + @Override + final void status0(long handle) throws RocksDBException { + status0Jni(handle); + } + private static native void status0Jni(long handle) throws RocksDBException; + @Override + final void seekDirect0( + final long handle, final ByteBuffer target, final int targetOffset, final int targetLen) { + seekDirect0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekDirect0Jni( final long handle, final ByteBuffer target, final int targetOffset, final int targetLen); @Override - final native void seekForPrevDirect0( + final void seekForPrevDirect0( + final long handle, final ByteBuffer target, final int targetOffset, final int targetLen) { + seekForPrevDirect0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekForPrevDirect0Jni( final long handle, final ByteBuffer target, final int targetOffset, final int targetLen); @Override - final native void seekByteArray0( + final void seekByteArray0( + final long handle, final byte[] target, final int targetOffset, final int targetLen) { + seekByteArray0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekByteArray0Jni( final long handle, final byte[] target, final int targetOffset, final int targetLen); @Override - final native void seekForPrevByteArray0( + final void seekForPrevByteArray0( + final long handle, final byte[] target, final int targetOffset, final int targetLen) { + seekForPrevByteArray0Jni(handle, target, targetOffset, targetLen); + } + private static native void seekForPrevByteArray0Jni( final long handle, final byte[] target, final int targetOffset, final int targetLen); - private native long[] entry1(final long handle); + private static native long[] entry1(final long handle); /** * Enumeration of the Write operation diff --git a/java/src/main/java/org/rocksdb/WriteBatch.java b/java/src/main/java/org/rocksdb/WriteBatch.java index 49e1f7f204a..1802d929c22 100644 --- a/java/src/main/java/org/rocksdb/WriteBatch.java +++ b/java/src/main/java/org/rocksdb/WriteBatch.java @@ -218,65 +218,175 @@ WriteBatch getWriteBatch(final long handle) { disOwnNativeHandle(); } - @Override protected final native void disposeInternal(final long handle); - @Override final native int count0(final long handle); - @Override final native void put(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen); - @Override final native void put(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen, - final long cfHandle); @Override - final native void putDirect(final long handle, final ByteBuffer key, final int keyOffset, + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + @Override + final int count0(final long handle) { + return count0Jni(handle); + } + + private static native int count0Jni(final long handle); + + @Override + final void put(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen) { + putJni(handle, key, keyLen, value, valueLen); + } + private static native void putJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen); + + @Override + final void put(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen, final long cfHandle) { + putJni(handle, key, keyLen, value, valueLen, cfHandle); + } + + private static native void putJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen, final long cfHandle); + + @Override + final void putDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final ByteBuffer value, final int valueOffset, final int valueLength, - final long cfHandle); - @Override final native void merge(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen); - @Override final native void merge(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen, - final long cfHandle); - @Override final native void delete(final long handle, final byte[] key, - final int keyLen) throws RocksDBException; - @Override final native void delete(final long handle, final byte[] key, - final int keyLen, final long cfHandle) throws RocksDBException; - @Override final native void singleDelete(final long handle, final byte[] key, - final int keyLen) throws RocksDBException; - @Override final native void singleDelete(final long handle, final byte[] key, - final int keyLen, final long cfHandle) throws RocksDBException; + final long cfHandle) { + putDirectJni(handle, key, keyOffset, keyLength, value, valueOffset, valueLength, cfHandle); + } + + private static native void putDirectJni(final long handle, final ByteBuffer key, + final int keyOffset, final int keyLength, final ByteBuffer value, final int valueOffset, + final int valueLength, final long cfHandle); + + @Override + final void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen) { + mergeJni(handle, key, keyLen, value, valueLen); + } + + private static native void mergeJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen); + + @Override + final void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen, final long cfHandle) { + mergeJni(handle, key, keyLen, value, valueLen, cfHandle); + } + private static native void mergeJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen, final long cfHandle); + @Override - final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, - final int keyLength, final long cfHandle) throws RocksDBException; + final void delete(final long handle, final byte[] key, final int keyLen) throws RocksDBException { + deleteJni(handle, key, keyLen); + } + + private static native void deleteJni(final long handle, final byte[] key, final int keyLen) + throws RocksDBException; + @Override - final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen); + final void delete(final long handle, final byte[] key, final int keyLen, final long cfHandle) + throws RocksDBException { + deleteJni(handle, key, keyLen, cfHandle); + } + private static native void deleteJni(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; @Override - final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen, final long cfHandle); - @Override final native void putLogData(final long handle, - final byte[] blob, final int blobLen) throws RocksDBException; - @Override final native void clear0(final long handle); - @Override final native void setSavePoint0(final long handle); - @Override final native void rollbackToSavePoint0(final long handle); - @Override final native void popSavePoint(final long handle) throws RocksDBException; - @Override final native void setMaxBytes(final long nativeHandle, - final long maxBytes); + final void singleDelete(final long handle, final byte[] key, final int keyLen) + throws RocksDBException { + singleDeleteJni(handle, key, keyLen); + } + + private static native void singleDeleteJni(final long handle, final byte[] key, final int keyLen) + throws RocksDBException; + + @Override + final void singleDelete(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException { + singleDeleteJni(handle, key, keyLen, cfHandle); + } + private static native void singleDeleteJni(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; + + @Override + final void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, + final int keyLength, final long cfHandle) throws RocksDBException { + deleteDirectJni(handle, key, keyOffset, keyLength, cfHandle); + } + + private static native void deleteDirectJni(final long handle, final ByteBuffer key, + final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; + + @Override + final void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, + final byte[] endKey, final int endKeyLen) { + deleteRangeJni(handle, beginKey, beginKeyLen, endKey, endKeyLen); + } + private static native void deleteRangeJni(final long handle, final byte[] beginKey, + final int beginKeyLen, final byte[] endKey, final int endKeyLen); + @Override + final void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, + final byte[] endKey, final int endKeyLen, final long cfHandle) { + deleteRangeJni(handle, beginKey, beginKeyLen, endKey, endKeyLen, cfHandle); + } + private static native void deleteRangeJni(final long handle, final byte[] beginKey, + final int beginKeyLen, final byte[] endKey, final int endKeyLen, final long cfHandle); + + @Override + final void putLogData(final long handle, final byte[] blob, final int blobLen) + throws RocksDBException { + putLogDataJni(handle, blob, blobLen); + } + + private static native void putLogDataJni(final long handle, final byte[] blob, final int blobLen) + throws RocksDBException; + + @Override + final void clear0(final long handle) { + clear0Jni(handle); + } + + private static native void clear0Jni(final long handle); + + @Override + final void setSavePoint0(final long handle) { + setSavePoint0Jni(handle); + } + private static native void setSavePoint0Jni(final long handle); + @Override + final void rollbackToSavePoint0(final long handle) { + rollbackToSavePoint0Jni(handle); + } + private static native void rollbackToSavePoint0Jni(final long handle); + @Override + final void popSavePoint(final long handle) throws RocksDBException { + popSavePointJni(handle); + } + private static native void popSavePointJni(final long handle) throws RocksDBException; + @Override + final void setMaxBytes(final long nativeHandle, final long maxBytes) { + setMaxBytesJni(nativeHandle, maxBytes); + } + + private static native void setMaxBytesJni(final long nativeHandle, final long maxBytes); private static native long newWriteBatch(final int reserved_bytes); private static native long newWriteBatch(final byte[] serialized, final int serializedLength); - private native void iterate(final long handle, final long handlerHandle) + private static native void iterate(final long handle, final long handlerHandle) throws RocksDBException; - private native byte[] data(final long nativeHandle) throws RocksDBException; - private native long getDataSize(final long nativeHandle); - private native boolean hasPut(final long nativeHandle); - private native boolean hasDelete(final long nativeHandle); - private native boolean hasSingleDelete(final long nativeHandle); - private native boolean hasDeleteRange(final long nativeHandle); - private native boolean hasMerge(final long nativeHandle); - private native boolean hasBeginPrepare(final long nativeHandle); - private native boolean hasEndPrepare(final long nativeHandle); - private native boolean hasCommit(final long nativeHandle); - private native boolean hasRollback(final long nativeHandle); - private native void markWalTerminationPoint(final long nativeHandle); - private native SavePoint getWalTerminationPoint(final long nativeHandle); + private static native byte[] data(final long nativeHandle) throws RocksDBException; + private static native long getDataSize(final long nativeHandle); + private static native boolean hasPut(final long nativeHandle); + private static native boolean hasDelete(final long nativeHandle); + private static native boolean hasSingleDelete(final long nativeHandle); + private static native boolean hasDeleteRange(final long nativeHandle); + private static native boolean hasMerge(final long nativeHandle); + private static native boolean hasBeginPrepare(final long nativeHandle); + private static native boolean hasEndPrepare(final long nativeHandle); + private static native boolean hasCommit(final long nativeHandle); + private static native boolean hasRollback(final long nativeHandle); + private static native void markWalTerminationPoint(final long nativeHandle); + private static native SavePoint getWalTerminationPoint(final long nativeHandle); /** * Handler callback for iterating over the contents of a batch. diff --git a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java index d41be5856ce..a9267bb40f8 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java +++ b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -292,67 +292,188 @@ public byte[] getFromBatchAndDB(final RocksDB db, final ReadOptions options, options.nativeHandle_, key, key.length); } - @Override protected final native void disposeInternal(final long handle); - @Override final native int count0(final long handle); - @Override final native void put(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen); - @Override final native void put(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen, - final long cfHandle); @Override - final native void putDirect(final long handle, final ByteBuffer key, final int keyOffset, + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + @Override + final int count0(final long handle) { + return count0Jni(handle); + } + private static native int count0Jni(final long handle); + + @Override + final void put(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen) { + putJni(handle, key, keyLen, value, valueLen); + } + + private static native void putJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen); + + @Override + final void put(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen, final long cfHandle) { + putJni(handle, key, keyLen, value, valueLen, cfHandle); + } + + private static native void putJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen, final long cfHandle); + + @Override + final void putDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final ByteBuffer value, final int valueOffset, final int valueLength, - final long cfHandle); - @Override final native void merge(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen); - @Override final native void merge(final long handle, final byte[] key, - final int keyLen, final byte[] value, final int valueLen, - final long cfHandle); - @Override final native void delete(final long handle, final byte[] key, - final int keyLen) throws RocksDBException; - @Override final native void delete(final long handle, final byte[] key, - final int keyLen, final long cfHandle) throws RocksDBException; - @Override final native void singleDelete(final long handle, final byte[] key, - final int keyLen) throws RocksDBException; - @Override final native void singleDelete(final long handle, final byte[] key, - final int keyLen, final long cfHandle) throws RocksDBException; + final long cfHandle) { + putDirectJni(handle, key, keyOffset, keyLength, value, valueOffset, valueLength, cfHandle); + } + + private static native void putDirectJni(final long handle, final ByteBuffer key, + final int keyOffset, final int keyLength, final ByteBuffer value, final int valueOffset, + final int valueLength, final long cfHandle); + + @Override + final void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen) { + mergeJni(handle, key, keyLen, value, valueLen); + } + + private static native void mergeJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen); + + @Override + final void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, + final int valueLen, final long cfHandle) { + mergeJni(handle, key, keyLen, value, valueLen, cfHandle); + } + + private static native void mergeJni(final long handle, final byte[] key, final int keyLen, + final byte[] value, final int valueLen, final long cfHandle); + @Override + final void delete(final long handle, final byte[] key, final int keyLen) throws RocksDBException { + deleteJni(handle, key, keyLen); + } + private static native void deleteJni(final long handle, final byte[] key, final int keyLen) + throws RocksDBException; + + @Override + final void delete(final long handle, final byte[] key, final int keyLen, final long cfHandle) + throws RocksDBException { + deleteJni(handle, key, keyLen, cfHandle); + } + + private static native void deleteJni(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; + + @Override + final void singleDelete(final long handle, final byte[] key, final int keyLen) + throws RocksDBException { + singleDeleteJni(handle, key, keyLen); + } + + private static native void singleDeleteJni(final long handle, final byte[] key, final int keyLen) + throws RocksDBException; + + @Override + final void singleDelete(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException { + singleDeleteJni(handle, key, keyLen, cfHandle); + } + + private static native void singleDeleteJni(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; + @Override - final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, - final int keyLength, final long cfHandle) throws RocksDBException; + final void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, + final int keyLength, final long cfHandle) throws RocksDBException { + deleteDirectJni(handle, key, keyOffset, keyLength, cfHandle); + } + + private static native void deleteDirectJni(final long handle, final ByteBuffer key, + final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; + // DO NOT USE - `WriteBatchWithIndex::deleteRange` is not yet supported @Override - final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen); + final void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, + final byte[] endKey, final int endKeyLen) { + deleteRangeJni(handle, beginKey, beginKeyLen, endKey, endKeyLen); + } + + private static native void deleteRangeJni(final long handle, final byte[] beginKey, + final int beginKeyLen, final byte[] endKey, final int endKeyLen); + // DO NOT USE - `WriteBatchWithIndex::deleteRange` is not yet supported @Override - final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen, final long cfHandle); - @Override final native void putLogData(final long handle, final byte[] blob, - final int blobLen) throws RocksDBException; - @Override final native void clear0(final long handle); - @Override final native void setSavePoint0(final long handle); - @Override final native void rollbackToSavePoint0(final long handle); - @Override final native void popSavePoint(final long handle) throws RocksDBException; - @Override final native void setMaxBytes(final long nativeHandle, - final long maxBytes); - @Override final native WriteBatch getWriteBatch(final long handle); + final void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, + final byte[] endKey, final int endKeyLen, final long cfHandle) { + deleteRangeJni(handle, beginKey, beginKeyLen, endKey, endKeyLen, cfHandle); + } + + private static native void deleteRangeJni(final long handle, final byte[] beginKey, + final int beginKeyLen, final byte[] endKey, final int endKeyLen, final long cfHandle); + + @Override + final void putLogData(final long handle, final byte[] blob, final int blobLen) + throws RocksDBException { + putLogDataJni(handle, blob, blobLen); + } + private static native void putLogDataJni(final long handle, final byte[] blob, final int blobLen) + throws RocksDBException; + + @Override + final void clear0(final long handle) { + clear0Jni(handle); + } + + private static native void clear0Jni(final long handle); + @Override + final void setSavePoint0(final long handle) { + setSavePoint0Jni(handle); + } + private static native void setSavePoint0Jni(final long handle); + + @Override + final void rollbackToSavePoint0(final long handle) { + rollbackToSavePoint0Jni(handle); + } + + private static native void rollbackToSavePoint0Jni(final long handle); + + @Override + final void popSavePoint(final long handle) throws RocksDBException { + popSavePointJni(handle); + } + + private static native void popSavePointJni(final long handle) throws RocksDBException; + + @Override + final void setMaxBytes(final long nativeHandle, final long maxBytes) { + setMaxBytesJni(nativeHandle, maxBytes); + } + + private static native void setMaxBytesJni(final long nativeHandle, final long maxBytes); + + @Override + final WriteBatch getWriteBatch(final long handle) { + return getWriteBatchJni(handle); + } + + private static native WriteBatch getWriteBatchJni(final long handle); private static native long newWriteBatchWithIndex(); private static native long newWriteBatchWithIndex(final boolean overwriteKey); private static native long newWriteBatchWithIndex(final long fallbackIndexComparatorHandle, final byte comparatorType, final int reservedBytes, final boolean overwriteKey); - private native long iterator0(final long handle); - private native long iterator1(final long handle, final long cfHandle); - private native long iteratorWithBase(final long handle, final long cfHandle, + private static native long iterator0(final long handle); + private static native long iterator1(final long handle, final long cfHandle); + private static native long iteratorWithBase(final long handle, final long cfHandle, final long baseIteratorHandle, final long readOptionsHandle); - private native byte[] getFromBatch(final long handle, final long optHandle, - final byte[] key, final int keyLen); - private native byte[] getFromBatch(final long handle, final long optHandle, + private static native byte[] getFromBatch( + final long handle, final long optHandle, final byte[] key, final int keyLen); + private static native byte[] getFromBatch(final long handle, final long optHandle, final byte[] key, final int keyLen, final long cfHandle); - private native byte[] getFromBatchAndDB(final long handle, - final long dbHandle, final long readOptHandle, final byte[] key, - final int keyLen); - private native byte[] getFromBatchAndDB(final long handle, - final long dbHandle, final long readOptHandle, final byte[] key, - final int keyLen, final long cfHandle); + private static native byte[] getFromBatchAndDB(final long handle, final long dbHandle, + final long readOptHandle, final byte[] key, final int keyLen); + private static native byte[] getFromBatchAndDB(final long handle, final long dbHandle, + final long readOptHandle, final byte[] key, final int keyLen, final long cfHandle); } diff --git a/java/src/main/java/org/rocksdb/WriteBufferManager.java b/java/src/main/java/org/rocksdb/WriteBufferManager.java index 40176aba42f..495fbdb961b 100644 --- a/java/src/main/java/org/rocksdb/WriteBufferManager.java +++ b/java/src/main/java/org/rocksdb/WriteBufferManager.java @@ -45,7 +45,11 @@ private static native long newWriteBufferManager( final long bufferSizeBytes, final long cacheHandle, final boolean allowStall); @Override - protected native void disposeInternal(final long handle); + protected void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(final long handle); private final boolean allowStall_; } diff --git a/java/src/main/java/org/rocksdb/WriteOptions.java b/java/src/main/java/org/rocksdb/WriteOptions.java index 7c184b09492..571cdb100f5 100644 --- a/java/src/main/java/org/rocksdb/WriteOptions.java +++ b/java/src/main/java/org/rocksdb/WriteOptions.java @@ -235,21 +235,24 @@ public WriteOptions setMemtableInsertHintPerBatch(final boolean memtableInsertHi private static native long newWriteOptions(); private static native long copyWriteOptions(long handle); - @Override protected final native void disposeInternal(final long handle); - - private native void setSync(long handle, boolean flag); - private native boolean sync(long handle); - private native void setDisableWAL(long handle, boolean flag); - private native boolean disableWAL(long handle); - private native void setIgnoreMissingColumnFamilies(final long handle, - final boolean ignoreMissingColumnFamilies); - private native boolean ignoreMissingColumnFamilies(final long handle); - private native void setNoSlowdown(final long handle, - final boolean noSlowdown); - private native boolean noSlowdown(final long handle); - private native void setLowPri(final long handle, final boolean lowPri); - private native boolean lowPri(final long handle); - private native boolean memtableInsertHintPerBatch(final long handle); - private native void setMemtableInsertHintPerBatch( + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } + private static native void disposeInternalJni(final long handle); + + private static native void setSync(long handle, boolean flag); + private static native boolean sync(long handle); + private static native void setDisableWAL(long handle, boolean flag); + private static native boolean disableWAL(long handle); + private static native void setIgnoreMissingColumnFamilies( + final long handle, final boolean ignoreMissingColumnFamilies); + private static native boolean ignoreMissingColumnFamilies(final long handle); + private static native void setNoSlowdown(final long handle, final boolean noSlowdown); + private static native boolean noSlowdown(final long handle); + private static native void setLowPri(final long handle, final boolean lowPri); + private static native boolean lowPri(final long handle); + private static native boolean memtableInsertHintPerBatch(final long handle); + private static native void setMemtableInsertHintPerBatch( final long handle, final boolean memtableInsertHintPerBatch); } diff --git a/java/src/main/java/org/rocksdb/util/BufferUtil.java b/java/src/main/java/org/rocksdb/util/BufferUtil.java new file mode 100644 index 00000000000..54be3e6937d --- /dev/null +++ b/java/src/main/java/org/rocksdb/util/BufferUtil.java @@ -0,0 +1,16 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb.util; + +public class BufferUtil { + public static void CheckBounds(final int offset, final int len, final int size) { + if ((offset | len | (offset + len) | (size - (offset + len))) < 0) { + throw new IndexOutOfBoundsException( + String.format("offset(%d), len(%d), size(%d)", offset, len, size)); + } + } +} diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index 53ff65d2637..78b73dc5d43 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -36,6 +36,10 @@ public static boolean isS390x() { return ARCH.contains("s390x"); } + public static boolean isRiscv64() { + return ARCH.contains("riscv64"); + } + public static boolean isWindows() { return (OS.contains("win")); } @@ -180,7 +184,7 @@ private static String getLibcPostfix() { public static String getJniLibraryName(final String name) { if (isUnix()) { final String arch = is64Bit() ? "64" : "32"; - if (isPowerPC() || isAarch64()) { + if (isPowerPC() || isAarch64() || isRiscv64()) { return String.format("%sjni-linux-%s%s", name, ARCH, getLibcPostfix()); } else if (isS390x()) { return String.format("%sjni-linux-%s", name, ARCH); diff --git a/java/src/main/java/org/rocksdb/util/StdErrLogger.java b/java/src/main/java/org/rocksdb/util/StdErrLogger.java new file mode 100644 index 00000000000..00b08d38452 --- /dev/null +++ b/java/src/main/java/org/rocksdb/util/StdErrLogger.java @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb.util; + +import org.rocksdb.InfoLogLevel; +import org.rocksdb.LoggerInterface; +import org.rocksdb.LoggerType; +import org.rocksdb.RocksObject; + +/** + * Simply redirects all log messages to StdErr. + */ +public class StdErrLogger extends RocksObject implements LoggerInterface { + /** + * Constructs a new StdErrLogger. + * + * @param logLevel the level at which to log. + */ + public StdErrLogger(final InfoLogLevel logLevel) { + this(logLevel, null); + } + + /** + * Constructs a new StdErrLogger. + * + * @param logLevel the level at which to log. + * @param logPrefix the string with which to prefix all log messages. + */ + public StdErrLogger(final InfoLogLevel logLevel, /* @Nullable */ final String logPrefix) { + super(newStdErrLogger(logLevel.getValue(), logPrefix)); + } + + @Override + public void setInfoLogLevel(final InfoLogLevel logLevel) { + setInfoLogLevel(nativeHandle_, logLevel.getValue()); + } + + @Override + public InfoLogLevel infoLogLevel() { + return InfoLogLevel.getInfoLogLevel(infoLogLevel(nativeHandle_)); + } + + @Override + public LoggerType getLoggerType() { + return LoggerType.STDERR_IMPLEMENTATION; + } + + private static native long newStdErrLogger( + final byte logLevel, /* @Nullable */ final String logPrefix); + private static native void setInfoLogLevel(final long handle, final byte logLevel); + private static native byte infoLogLevel(final long handle); + + @Override protected native void disposeInternal(final long handle); +} diff --git a/java/src/test/java/org/rocksdb/AbstractTransactionTest.java b/java/src/test/java/org/rocksdb/AbstractTransactionTest.java index d5725800902..2977d78fd2e 100644 --- a/java/src/test/java/org/rocksdb/AbstractTransactionTest.java +++ b/java/src/test/java/org/rocksdb/AbstractTransactionTest.java @@ -8,10 +8,12 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; +import java.util.function.Function; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -181,8 +183,10 @@ public void getPut_cf() throws RocksDBException { final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(readOptions, testCf, k1)).isNull(); assertThat(txn.get(testCf, readOptions, k1)).isNull(); txn.put(testCf, k1, v1); + assertThat(txn.get(readOptions, testCf, k1)).isEqualTo(v1); assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); } } @@ -200,6 +204,135 @@ public void getPut() throws RocksDBException { } } + @Test + public void getPutTargetBuffer_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final byte[] target = "overwrite1".getBytes(UTF_8); + GetStatus status = txn.get(readOptions, testCf, k1, target); + assertThat(status.status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(status.requiredSize).isEqualTo(0); + txn.put(testCf, k1, v1); + status = txn.get(readOptions, testCf, k1, target); + assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(status.requiredSize).isEqualTo(v1.length); + assertThat(target).isEqualTo("value1ite1".getBytes()); + } + } + + @Test + public void getPutTargetBuffer() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final byte[] target = "overwrite1".getBytes(UTF_8); + GetStatus status = txn.get(readOptions, k1, target); + assertThat(status.status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(status.requiredSize).isEqualTo(0); + txn.put(k1, v1); + status = txn.get(readOptions, k1, target); + assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(status.requiredSize).isEqualTo(v1.length); + assertThat(target).isEqualTo("value1ite1".getBytes()); + } + } + + public void getPutByteBuffer(final Function allocateBuffer) + throws RocksDBException { + final ByteBuffer k1 = allocateBuffer.apply(100).put("key1".getBytes(UTF_8)); + k1.flip(); + final ByteBuffer v1 = allocateBuffer.apply(100).put("value1".getBytes(UTF_8)); + v1.flip(); + final ByteBuffer vEmpty = allocateBuffer.apply(0); + + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ByteBuffer vGet = allocateBuffer.apply(100); + assertThat(txn.get(readOptions, k1, vGet).status.getCode()).isEqualTo(Status.Code.NotFound); + txn.put(k1, v1); + + final GetStatus getStatusError = txn.get(readOptions, k1, vEmpty); + assertThat(getStatusError.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(getStatusError.requiredSize).isEqualTo("value1".getBytes(UTF_8).length); + assertThat(vEmpty.position()).isEqualTo(0); + assertThat(vEmpty.remaining()).isEqualTo(0); + + vGet.put("12345".getBytes(UTF_8)); + + final GetStatus getStatus = txn.get(readOptions, k1, vGet); + assertThat(getStatusError.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(getStatusError.requiredSize).isEqualTo("value1".getBytes(UTF_8).length); + + vGet.put("67890".getBytes(UTF_8)); + vGet.flip(); + final byte[] bytes = new byte[vGet.limit()]; + vGet.get(bytes); + assertThat(new String(bytes, UTF_8)).isEqualTo("12345value167890"); + } + } + + @Test + public void getPutDirectByteBuffer() throws RocksDBException { + getPutByteBuffer(ByteBuffer::allocateDirect); + } + + @Test + public void getPutIndirectByteBuffer() throws RocksDBException { + getPutByteBuffer(ByteBuffer::allocate); + } + + public void getPutByteBuffer_cf(final Function allocateBuffer) + throws RocksDBException { + final ByteBuffer k1 = allocateBuffer.apply(100).put("key1".getBytes(UTF_8)); + k1.flip(); + final ByteBuffer v1 = allocateBuffer.apply(100).put("value1".getBytes(UTF_8)); + v1.flip(); + final ByteBuffer vEmpty = allocateBuffer.apply(0); + + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final ByteBuffer vGet = allocateBuffer.apply(100); + assertThat(txn.get(readOptions, testCf, k1, vGet).status.getCode()) + .isEqualTo(Status.Code.NotFound); + txn.put(testCf, k1, v1); + + final GetStatus getStatusError = txn.get(readOptions, testCf, k1, vEmpty); + assertThat(getStatusError.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(getStatusError.requiredSize).isEqualTo("value1".getBytes(UTF_8).length); + assertThat(vEmpty.position()).isEqualTo(0); + assertThat(vEmpty.remaining()).isEqualTo(0); + + vGet.put("12345".getBytes(UTF_8)); + final GetStatus getStatus = txn.get(readOptions, testCf, k1, vGet); + assertThat(getStatus.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(getStatus.requiredSize).isEqualTo("value1".getBytes(UTF_8).length); + vGet.put("67890".getBytes(UTF_8)); + vGet.flip(); + final byte[] bytes = new byte[vGet.limit()]; + vGet.get(bytes); + assertThat(new String(bytes, UTF_8)).isEqualTo("12345value167890"); + } + } + + @Test + public void getPutDirectByteBuffer_cf() throws RocksDBException { + getPutByteBuffer_cf(ByteBuffer::allocateDirect); + } + + @Test + public void getPutIndirectByteBuffer_cf() throws RocksDBException { + getPutByteBuffer_cf(ByteBuffer::allocate); + } + @Test public void multiGetPut_cf() throws RocksDBException { final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)}; @@ -300,6 +433,162 @@ public void getForUpdate() throws RocksDBException { } } + @Test + public void getForUpdateByteArray_cf_doValidate() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final byte[] vNonExistent = new byte[1]; + final GetStatus sNonExistent = + txn.getForUpdate(readOptions, testCf, k1, vNonExistent, true, true); + assertThat(sNonExistent.status.getCode()).isEqualTo(Status.Code.NotFound); + txn.put(testCf, k1, v1); + final byte[] vPartial = new byte[4]; + final GetStatus sPartial = txn.getForUpdate(readOptions, testCf, k1, vPartial, true, true); + assertThat(sPartial.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(sPartial.requiredSize).isEqualTo(v1.length); + assertThat(vPartial).isEqualTo(Arrays.copyOfRange(v1, 0, vPartial.length)); + + final byte[] vTotal = new byte[sPartial.requiredSize]; + final GetStatus sTotal = txn.getForUpdate(readOptions, testCf, k1, vTotal, true, true); + assertThat(sTotal.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(sTotal.requiredSize).isEqualTo(v1.length); + assertThat(vTotal).isEqualTo(v1); + } + } + + public void getForUpdateByteArray_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final byte[] vNonExistent = new byte[1]; + final GetStatus sNonExistent = txn.getForUpdate(readOptions, testCf, k1, vNonExistent, true); + assertThat(sNonExistent.status.getCode()).isEqualTo(Status.Code.NotFound); + txn.put(testCf, k1, v1); + final byte[] vPartial = new byte[4]; + final GetStatus sPartial = txn.getForUpdate(readOptions, testCf, k1, vPartial, true); + assertThat(sPartial.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(sPartial.requiredSize).isEqualTo(v1.length); + assertThat(vPartial).isEqualTo(Arrays.copyOfRange(v1, 0, vPartial.length)); + + final byte[] vTotal = new byte[sPartial.requiredSize]; + final GetStatus sTotal = txn.getForUpdate(readOptions, testCf, k1, vTotal, true); + assertThat(sTotal.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(sTotal.requiredSize).isEqualTo(v1.length); + assertThat(vTotal).isEqualTo(v1); + } + } + + @Test + public void getForUpdateByteArray() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final byte[] vNonExistent = new byte[1]; + final GetStatus sNonExistent = txn.getForUpdate(readOptions, k1, vNonExistent, true); + assertThat(sNonExistent.status.getCode()).isEqualTo(Status.Code.NotFound); + txn.put(k1, v1); + final byte[] vPartial = new byte[4]; + final GetStatus sPartial = txn.getForUpdate(readOptions, k1, vPartial, true); + assertThat(sPartial.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(sPartial.requiredSize).isEqualTo(v1.length); + assertThat(vPartial).isEqualTo(Arrays.copyOfRange(v1, 0, vPartial.length)); + + final byte[] vTotal = new byte[sPartial.requiredSize]; + final GetStatus sTotal = txn.getForUpdate(readOptions, k1, vTotal, true); + assertThat(sTotal.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(sTotal.requiredSize).isEqualTo(v1.length); + assertThat(vTotal).isEqualTo(v1); + } + } + + @Test + public void getForUpdateDirectByteBuffer() throws Exception { + getForUpdateByteBuffer(ByteBuffer::allocateDirect); + } + + @Test + public void getForUpdateIndirectByteBuffer() throws Exception { + getForUpdateByteBuffer(ByteBuffer::allocate); + } + + public void getForUpdateByteBuffer(final Function allocateBuffer) + throws Exception { + final ByteBuffer k1 = allocateBuffer.apply(20).put("key1".getBytes(UTF_8)); + k1.flip(); + final ByteBuffer v1 = allocateBuffer.apply(20).put("value1".getBytes(UTF_8)); + v1.flip(); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ByteBuffer v1Read1 = allocateBuffer.apply(20); + final GetStatus getStatus1 = txn.getForUpdate(readOptions, k1, v1Read1, true); + assertThat(getStatus1.status.getCode()).isEqualTo(Status.Code.NotFound); + txn.put(k1, v1); + final ByteBuffer v1Read2 = allocateBuffer.apply(20); + final GetStatus getStatus2 = txn.getForUpdate(readOptions, k1, v1Read2, true); + assertThat(getStatus2.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(getStatus2.requiredSize).isEqualTo("value1".getBytes(UTF_8).length); + assertThat(v1Read2).isEqualTo(allocateBuffer.apply(20).put("value1".getBytes(UTF_8))); + } + } + + @Test + public void getForUpdateDirectByteBuffer_cf() throws Exception { + getForUpdateByteBuffer_cf(ByteBuffer::allocateDirect); + } + + @Test + public void getForUpdateIndirectByteBuffer_cf() throws Exception { + getForUpdateByteBuffer_cf(ByteBuffer::allocate); + } + + public void getForUpdateByteBuffer_cf(final Function allocateBuffer) + throws Exception { + final ByteBuffer k1 = allocateBuffer.apply(20).put("key1".getBytes(UTF_8)); + k1.flip(); + final ByteBuffer v1 = allocateBuffer.apply(20).put("value1".getBytes(UTF_8)); + v1.flip(); + final ByteBuffer k2 = allocateBuffer.apply(20).put("key2".getBytes(UTF_8)); + k2.flip(); + final ByteBuffer v2 = allocateBuffer.apply(20).put("value2".getBytes(UTF_8)); + v2.flip(); + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final ByteBuffer v1Read1 = allocateBuffer.apply(20); + final GetStatus getStatus1 = txn.getForUpdate(readOptions, testCf, k1, v1Read1, true); + assertThat(getStatus1.status.getCode()).isEqualTo(Status.Code.NotFound); + txn.put(k1, v1); + k1.flip(); + v1.flip(); + txn.put(testCf, k2, v2); + k2.flip(); + v2.flip(); + final ByteBuffer v1Read2 = allocateBuffer.apply(20); + final GetStatus getStatus2 = txn.getForUpdate(readOptions, testCf, k1, v1Read2, true); + assertThat(getStatus2.status.getCode()).isEqualTo(Status.Code.NotFound); + k1.flip(); + txn.put(testCf, k1, v1); + k1.flip(); + v1.flip(); + final ByteBuffer v1Read3 = allocateBuffer.apply(20); + final GetStatus getStatus3 = txn.getForUpdate(readOptions, testCf, k1, v1Read3, true); + assertThat(getStatus3.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(getStatus3.requiredSize).isEqualTo("value1".getBytes(UTF_8).length); + assertThat(v1Read3).isEqualTo(allocateBuffer.apply(20).put("value1".getBytes(UTF_8))); + } + } + @Test public void multiGetForUpdate_cf() throws RocksDBException { final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)}; @@ -401,6 +690,13 @@ public void getIterator() throws RocksDBException { assertThat(iterator.key()).isEqualTo(k1); assertThat(iterator.value()).isEqualTo(v1); } + + try (final RocksIterator iterator = txn.getIterator()) { + iterator.seek(k1); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(k1); + assertThat(iterator.value()).isEqualTo(v1); + } } } @@ -422,6 +718,13 @@ public void getIterator_cf() throws RocksDBException { assertThat(iterator.key()).isEqualTo(k1); assertThat(iterator.value()).isEqualTo(v1); } + + try (final RocksIterator iterator = txn.getIterator(testCf)) { + iterator.seek(k1); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(k1); + assertThat(iterator.value()).isEqualTo(v1); + } } } @@ -429,11 +732,15 @@ public void getIterator_cf() throws RocksDBException { public void merge_cf() throws RocksDBException { final byte[] k1 = "key1".getBytes(UTF_8); final byte[] v1 = "value1".getBytes(UTF_8); + final byte[] v2 = "value2".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); - txn.merge(testCf, k1, v1); + txn.put(testCf, k1, v1); + txn.merge(testCf, k1, v2); + assertThat(txn.get(new ReadOptions(), testCf, k1)).isEqualTo("value1**value2".getBytes()); + assertThat(txn.get(testCf, new ReadOptions(), k1)).isEqualTo("value1**value2".getBytes()); } } @@ -441,13 +748,94 @@ public void merge_cf() throws RocksDBException { public void merge() throws RocksDBException { final byte[] k1 = "key1".getBytes(UTF_8); final byte[] v1 = "value1".getBytes(UTF_8); + final byte[] v2 = "value2".getBytes(UTF_8); + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + txn.merge(k1, v2); + assertThat(txn.get(new ReadOptions(), k1)).isEqualTo("value1++value2".getBytes()); + } + } + + @Test + public void mergeDirectByteBuffer() throws RocksDBException { + final ByteBuffer k1 = ByteBuffer.allocateDirect(100).put("key1".getBytes(UTF_8)); + final ByteBuffer v1 = ByteBuffer.allocateDirect(100).put("value1".getBytes(UTF_8)); + final ByteBuffer v2 = ByteBuffer.allocateDirect(100).put("value2".getBytes(UTF_8)); + k1.flip(); + v1.flip(); + v2.flip(); + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + k1.flip(); + v1.flip(); + txn.merge(k1, v2); + assertThat(txn.get(new ReadOptions(), "key1".getBytes(UTF_8))) + .isEqualTo("value1++value2".getBytes()); + } + } + + public void mergeIndirectByteBuffer() throws RocksDBException { + final ByteBuffer k1 = ByteBuffer.allocate(100).put("key1".getBytes(UTF_8)); + k1.flip(); + final ByteBuffer v1 = ByteBuffer.allocate(100).put("value1".getBytes(UTF_8)); + v1.flip(); + final ByteBuffer v2 = ByteBuffer.allocate(100).put("value2".getBytes(UTF_8)); + v2.flip(); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { - txn.merge(k1, v1); + txn.put(k1, v1); + txn.merge(k1, v2); + assertThat(txn.get(new ReadOptions(), "key1".getBytes(UTF_8))) + .isEqualTo("value1++value2".getBytes()); + } + } + + @Test + public void mergeDirectByteBuffer_cf() throws RocksDBException { + final ByteBuffer k1 = ByteBuffer.allocateDirect(100).put("key1".getBytes(UTF_8)); + final ByteBuffer v1 = ByteBuffer.allocateDirect(100).put("value1".getBytes(UTF_8)); + final ByteBuffer v2 = ByteBuffer.allocateDirect(100).put("value2".getBytes(UTF_8)); + k1.flip(); + v1.flip(); + v2.flip(); + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, k1, v1); + k1.flip(); + v1.flip(); + txn.merge(testCf, k1, v2); + assertThat(txn.get(new ReadOptions(), testCf, "key1".getBytes(UTF_8))) + .isEqualTo("value1**value2".getBytes()); + assertThat(txn.get(testCf, new ReadOptions(), "key1".getBytes(UTF_8))) + .isEqualTo("value1**value2".getBytes()); } } + public void mergeIndirectByteBuffer_cf() throws RocksDBException { + final ByteBuffer k1 = ByteBuffer.allocate(100).put("key1".getBytes(UTF_8)); + k1.flip(); + final ByteBuffer v1 = ByteBuffer.allocate(100).put("value1".getBytes(UTF_8)); + v1.flip(); + final ByteBuffer v2 = ByteBuffer.allocate(100).put("value2".getBytes(UTF_8)); + v2.flip(); + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, k1, v1); + txn.merge(testCf, k1, v2); + assertThat(txn.get(new ReadOptions(), testCf, "key1".getBytes(UTF_8))) + .isEqualTo("value1**value2".getBytes()); + assertThat(txn.get(testCf, new ReadOptions(), "key1".getBytes(UTF_8))) + .isEqualTo("value1**value2".getBytes()); + } + } @Test public void delete_cf() throws RocksDBException { @@ -459,9 +847,11 @@ public void delete_cf() throws RocksDBException { final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); txn.put(testCf, k1, v1); + assertThat(txn.get(readOptions, testCf, k1)).isEqualTo(v1); assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); txn.delete(testCf, k1); + assertThat(txn.get(readOptions, testCf, k1)).isNull(); assertThat(txn.get(testCf, readOptions, k1)).isNull(); } } @@ -495,11 +885,12 @@ public void delete_parts_cf() throws RocksDBException { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); txn.put(testCf, keyParts, valueParts); assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value); + assertThat(txn.get(readOptions, testCf, key)).isEqualTo(value); txn.delete(testCf, keyParts); - assertThat(txn.get(testCf, readOptions, key)) - .isNull(); + assertThat(txn.get(readOptions, testCf, key)).isNull(); + assertThat(txn.get(testCf, readOptions, key)).isNull(); } } @@ -532,8 +923,10 @@ public void getPutUntracked_cf() throws RocksDBException { final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(readOptions, testCf, k1)).isNull(); assertThat(txn.get(testCf, readOptions, k1)).isNull(); txn.putUntracked(testCf, k1, v1); + assertThat(txn.get(readOptions, testCf, k1)).isEqualTo(v1); assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); } } @@ -628,11 +1021,19 @@ public void multiGetPutAsListUntracked() throws RocksDBException { public void mergeUntracked_cf() throws RocksDBException { final byte[] k1 = "key1".getBytes(UTF_8); final byte[] v1 = "value1".getBytes(UTF_8); + final byte[] v2 = "value2".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); txn.mergeUntracked(testCf, k1, v1); + txn.mergeUntracked(testCf, k1, v2); + txn.commit(); + } + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(new ReadOptions(), testCf, k1)).isEqualTo("value1**value2".getBytes()); } } @@ -640,10 +1041,89 @@ public void mergeUntracked_cf() throws RocksDBException { public void mergeUntracked() throws RocksDBException { final byte[] k1 = "key1".getBytes(UTF_8); final byte[] v1 = "value1".getBytes(UTF_8); + final byte[] v2 = "value2".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { txn.mergeUntracked(k1, v1); + txn.mergeUntracked(k1, v2); + txn.commit(); + } + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.get(new ReadOptions(), k1)).isEqualTo("value1++value2".getBytes()); + } + } + + @Test + public void mergeUntrackedByteBuffer() throws RocksDBException { + final ByteBuffer k1 = ByteBuffer.allocateDirect(20).put("key1".getBytes(UTF_8)); + final ByteBuffer v1 = ByteBuffer.allocateDirect(20).put("value1".getBytes(UTF_8)); + final ByteBuffer v2 = ByteBuffer.allocateDirect(20).put("value2".getBytes(UTF_8)); + k1.flip(); + v1.flip(); + v2.flip(); + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.mergeUntracked(k1, v1); + k1.flip(); + v1.flip(); + txn.mergeUntracked(k1, v2); + k1.flip(); + v2.flip(); + txn.commit(); + } + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ByteBuffer v = ByteBuffer.allocateDirect(20); + final GetStatus status = txn.get(new ReadOptions(), k1, v); + assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok); + k1.flip(); + v.flip(); + final int expectedLength = "value1++value2".length(); + assertThat(v.remaining()).isEqualTo(expectedLength); + final byte[] vBytes = new byte[expectedLength]; + v.get(vBytes); + assertThat(vBytes).isEqualTo("value1++value2".getBytes()); + } + } + + @Test + public void mergeUntrackedByteBuffer_cf() throws RocksDBException { + final ByteBuffer k1 = ByteBuffer.allocateDirect(20).put("key1".getBytes(UTF_8)); + final ByteBuffer v1 = ByteBuffer.allocateDirect(20).put("value1".getBytes(UTF_8)); + final ByteBuffer v2 = ByteBuffer.allocateDirect(20).put("value2".getBytes(UTF_8)); + k1.flip(); + v1.flip(); + v2.flip(); + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.mergeUntracked(testCf, k1, v1); + k1.flip(); + v1.flip(); + txn.mergeUntracked(testCf, k1, v2); + k1.flip(); + v2.flip(); + txn.commit(); + } + + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ByteBuffer v = ByteBuffer.allocateDirect(20); + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final GetStatus status = txn.get(new ReadOptions(), testCf, k1, v); + assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok); + k1.flip(); + v.flip(); + final int expectedLength = "value1++value2".length(); + assertThat(v.remaining()).isEqualTo(expectedLength); + final byte[] vBytes = new byte[expectedLength]; + v.get(vBytes); + assertThat(vBytes).isEqualTo("value1**value2".getBytes()); } } @@ -657,9 +1137,11 @@ public void deleteUntracked_cf() throws RocksDBException { final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); txn.put(testCf, k1, v1); + assertThat(txn.get(readOptions, testCf, k1)).isEqualTo(v1); assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); txn.deleteUntracked(testCf, k1); + assertThat(txn.get(readOptions, testCf, k1)).isNull(); assertThat(txn.get(testCf, readOptions, k1)).isNull(); } } @@ -692,9 +1174,11 @@ public void deleteUntracked_parts_cf() throws RocksDBException { final Transaction txn = dbContainer.beginTransaction()) { final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); txn.put(testCf, keyParts, valueParts); + assertThat(txn.get(readOptions, testCf, key)).isEqualTo(value); assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value); txn.deleteUntracked(testCf, keyParts); + assertThat(txn.get(readOptions, testCf, key)).isNull(); assertThat(txn.get(testCf, readOptions, key)).isNull(); } } diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index fb8a4508550..16b3317e7ad 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -147,6 +147,228 @@ public void openWithColumnFamilies() throws RocksDBException { } } + @Test + public void defaultColumnFamilySynonyms() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("dfkey_syn_1".getBytes(), "dfvalue_syn_1".getBytes()); + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf2".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + assertThat(columnFamilyHandleList.size()).isEqualTo(3); + assertThat(db.get(columnFamilyHandleList.get(1), "dfkey_syn_1".getBytes())) + .isEqualTo("dfvalue_syn_1".getBytes()); + db.put(columnFamilyHandleList.get(1), "dfkey_syn_2".getBytes(), "dfvalue_syn_2".getBytes()); + } + + final List cfNames2 = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor("new_cf2".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList2 = new ArrayList<>(); + + try (final RocksDB db = RocksDB.open(new DBOptions(), dbFolder.getRoot().getAbsolutePath(), + cfNames2, columnFamilyHandleList2)) { + assertThat(db.get("dfkey_syn_2".getBytes())).isEqualTo("dfvalue_syn_2".getBytes()); + } + } + + @Test + public void defaultColumnFamilySynonymsReadOnly() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("dfkey_syn_1".getBytes(), "dfvalue_syn_1".getBytes()); + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf2".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + assertThat(columnFamilyHandleList.size()).isEqualTo(3); + assertThat(db.get(columnFamilyHandleList.get(1), "dfkey_syn_1".getBytes())) + .isEqualTo("dfvalue_syn_1".getBytes()); + db.put(columnFamilyHandleList.get(1), "dfkey_syn_2".getBytes(), "dfvalue_syn_2".getBytes()); + } + + final List cfNames2 = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor("new_cf2".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList2 = new ArrayList<>(); + + try (final RocksDB db = RocksDB.openReadOnly(new DBOptions(), + dbFolder.getRoot().getAbsolutePath(), cfNames2, columnFamilyHandleList2)) { + assertThat(db.get("dfkey_syn_2".getBytes())).isEqualTo("dfvalue_syn_2".getBytes()); + } + } + + @Test + public void defaultColumnFamilySynonymsOTDB() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB db = + OptimisticTransactionDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("dfkey_syn_1".getBytes(), "dfvalue_syn_1".getBytes()); + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf2".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final OptimisticTransactionDB db = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + assertThat(columnFamilyHandleList.size()).isEqualTo(3); + assertThat(db.get(columnFamilyHandleList.get(1), "dfkey_syn_1".getBytes())) + .isEqualTo("dfvalue_syn_1".getBytes()); + db.put(columnFamilyHandleList.get(1), "dfkey_syn_2".getBytes(), "dfvalue_syn_2".getBytes()); + } + + final List cfNames2 = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor("new_cf2".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList2 = new ArrayList<>(); + + try (final OptimisticTransactionDB db = OptimisticTransactionDB.open(new DBOptions(), + dbFolder.getRoot().getAbsolutePath(), cfNames2, columnFamilyHandleList2)) { + assertThat(db.get("dfkey_syn_2".getBytes())).isEqualTo("dfvalue_syn_2".getBytes()); + } + } + + @Test + public void defaultColumnFamilySynonymsTDB() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions transactionDBOptions = new TransactionDBOptions(); + final TransactionDB db = TransactionDB.open( + options, transactionDBOptions, dbFolder.getRoot().getAbsolutePath())) { + db.put("dfkey_syn_1".getBytes(), "dfvalue_syn_1".getBytes()); + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf2".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final TransactionDBOptions transactionDBOptions = new TransactionDBOptions(); + final TransactionDB db = TransactionDB.open(options, transactionDBOptions, + dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + assertThat(columnFamilyHandleList.size()).isEqualTo(3); + assertThat(db.get(columnFamilyHandleList.get(1), "dfkey_syn_1".getBytes())) + .isEqualTo("dfvalue_syn_1".getBytes()); + db.put(columnFamilyHandleList.get(1), "dfkey_syn_2".getBytes(), "dfvalue_syn_2".getBytes()); + } + + final List cfNames2 = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor("new_cf2".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList2 = new ArrayList<>(); + + try (final TransactionDBOptions transactionDBOptions = new TransactionDBOptions(); + final TransactionDB db = TransactionDB.open(new DBOptions(), transactionDBOptions, + dbFolder.getRoot().getAbsolutePath(), cfNames2, columnFamilyHandleList2)) { + assertThat(db.get("dfkey_syn_2".getBytes())).isEqualTo("dfvalue_syn_2".getBytes()); + } + } + + @Test + public void defaultColumnFamilySynonymsTTLDB() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TtlDB db = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("dfkey_syn_1".getBytes(), "dfvalue_syn_1".getBytes()); + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf2".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final TtlDB db = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, + columnFamilyHandleList, Arrays.asList(10, 10, 10), false)) { + assertThat(columnFamilyHandleList.size()).isEqualTo(3); + assertThat(db.get(columnFamilyHandleList.get(1), "dfkey_syn_1".getBytes())) + .isEqualTo("dfvalue_syn_1".getBytes()); + db.put(columnFamilyHandleList.get(1), "dfkey_syn_2".getBytes(), "dfvalue_syn_2".getBytes()); + } + + final List cfNames2 = + Arrays.asList(new ColumnFamilyDescriptor("new_cf1".getBytes()), + new ColumnFamilyDescriptor("new_cf2".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList2 = new ArrayList<>(); + + try (final TtlDB db = TtlDB.open(new DBOptions(), dbFolder.getRoot().getAbsolutePath(), + cfNames2, columnFamilyHandleList2, Arrays.asList(10, 10, 10), false)) { + assertThat(db.get("dfkey_syn_2".getBytes())).isEqualTo("dfvalue_syn_2".getBytes()); + } + } + + @Test(expected = IllegalArgumentException.class) + public void openColumnFamiliesNoDefault() throws RocksDBException { + try (final DBOptions dbOptions = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + final List columnFamilyDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List columnFamilyHandles = new ArrayList<>(); + + RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, + columnFamilyHandles); + } + } + + @Test(expected = IllegalArgumentException.class) + public void openColumnFamiliesNoDefaultReadOnly() throws RocksDBException { + try (final DBOptions dbOptions = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + final List columnFamilyDescriptors = + Arrays.asList(new ColumnFamilyDescriptor("myCf".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + + final List columnFamilyHandles = new ArrayList<>(); + + RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, + columnFamilyHandles); + } + + try (final DBOptions dbOptions = new DBOptions()) { + final List columnFamilyDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor("myCf".getBytes())); + + final List columnFamilyHandles = new ArrayList<>(); + + final RocksDB db = RocksDB.openReadOnly(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles); + db.close(); + } + } + @Test public void getWithOutValueAndCf() throws RocksDBException { final List cfDescriptors = @@ -169,13 +391,92 @@ public void getWithOutValueAndCf() throws RocksDBException { assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); // found value which fits in outValue getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(getResult).isEqualTo("value".getBytes().length); assertThat(outValue).isEqualTo("value".getBytes()); // found value which fits partially - getResult = - db.get(columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("12345".getBytes()); + } + } + + @Test + public void getWithOutValueAndCfPartial() throws RocksDBException { + final List cfDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + db.put(columnFamilyHandleList.get(0), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + + final byte[] partialOutValue = new byte[5]; + int getResult = db.get(columnFamilyHandleList.get(0), "key2".getBytes(), partialOutValue); + assertThat(getResult).isEqualTo("12345678".getBytes().length); + assertThat(partialOutValue).isEqualTo("12345".getBytes()); + + final byte[] offsetKeyValue = "abckey2hjk".getBytes(); + assertThat(offsetKeyValue.length).isEqualTo(10); + final byte[] offsetOutValue = "abcdefghjk".getBytes(); + assertThat(offsetOutValue.length).isEqualTo(10); + + getResult = db.get(columnFamilyHandleList.get(0), offsetKeyValue, 3, 4, offsetOutValue, 2, 5); + assertThat(getResult).isEqualTo("12345678".getBytes().length); + assertThat(offsetOutValue).isEqualTo("ab12345hjk".getBytes()); + } + } + + @Test + public void getWithOutValueAndCfPartialAndOptions() throws RocksDBException { + final List cfDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + db.put( + columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + + final byte[] partialOutValue = new byte[5]; + int getResult = db.get( + columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), partialOutValue); + assertThat(getResult).isEqualTo("12345678".getBytes().length); + assertThat(partialOutValue).isEqualTo("12345".getBytes()); + + final byte[] offsetKeyValue = "abckey2hjk".getBytes(); + assertThat(offsetKeyValue.length).isEqualTo(10); + final byte[] offsetOutValue = "abcdefghjk".getBytes(); + assertThat(offsetOutValue.length).isEqualTo(10); + + getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), offsetKeyValue, 3, 4, + offsetOutValue, 2, 5); + assertThat(getResult).isEqualTo("12345678".getBytes().length); + assertThat(offsetOutValue).isEqualTo("ab12345hjk".getBytes()); + } + } + + @Test(expected = IndexOutOfBoundsException.class) + public void getWithOutValueAndCfIndexOutOfBounds() throws RocksDBException { + final List cfDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + db.put( + columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + + final byte[] offsetKeyValue = "abckey2hjk".getBytes(); + final byte[] partialOutValue = new byte[5]; + + int getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), offsetKeyValue, 3, 4, + partialOutValue, 2, 5); } } @@ -297,8 +598,14 @@ public void iteratorOnColumnFamily() throws RocksDBException { } } - @Test - public void multiGet() throws RocksDBException { + @FunctionalInterface + public interface RocksDBTriFunction { + R apply(T1 t1, T2 t2, T3 t3) throws IllegalArgumentException, RocksDBException; + } + + private void multiGetHelper( + RocksDBTriFunction, List, List> multiGetter) + throws RocksDBException { final List cfDescriptors = Arrays.asList( new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), new ColumnFamilyDescriptor("new_cf".getBytes())); @@ -314,17 +621,24 @@ public void multiGet() throws RocksDBException { final List keys = Arrays.asList("key".getBytes(), "newcfkey".getBytes()); - List retValues = db.multiGetAsList(columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))).isEqualTo("value"); - assertThat(new String(retValues.get(1))).isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + List retValues = multiGetter.apply(db, columnFamilyHandleList, keys); assertThat(retValues.size()).isEqualTo(2); assertThat(new String(retValues.get(0))).isEqualTo("value"); assertThat(new String(retValues.get(1))).isEqualTo("value"); } } + @Test + public void multiGet() throws RocksDBException { + multiGetHelper(RocksDB::multiGetAsList); + } + + @Test + public void multiGetReadOptions() throws RocksDBException { + multiGetHelper( + (db, columnFamilies, keys) -> db.multiGetAsList(new ReadOptions(), columnFamilies, keys)); + } + @Test public void multiGetAsList() throws RocksDBException { final List cfDescriptors = Arrays.asList( diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index cb7eabcfb11..189acdb4a13 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -453,16 +453,6 @@ public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { } } - @SuppressWarnings("deprecated") - @Test - public void accessHintOnCompactionStart() { - try(final DBOptions opt = new DBOptions()) { - final AccessHint accessHint = AccessHint.SEQUENTIAL; - opt.setAccessHintOnCompactionStart(accessHint); - assertThat(opt.accessHintOnCompactionStart()).isEqualTo(accessHint); - } - } - @Test public void compactionReadaheadSize() { try(final DBOptions opt = new DBOptions()) { diff --git a/java/src/test/java/org/rocksdb/EventListenerTest.java b/java/src/test/java/org/rocksdb/EventListenerTest.java index 84be232f972..da41ae087a9 100644 --- a/java/src/test/java/org/rocksdb/EventListenerTest.java +++ b/java/src/test/java/org/rocksdb/EventListenerTest.java @@ -13,12 +13,14 @@ import java.util.concurrent.atomic.AtomicBoolean; import org.assertj.core.api.AbstractObjectAssert; import org.assertj.core.api.ObjectAssert; +import org.junit.Assume; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.rocksdb.AbstractEventListener.EnabledEventCallback; import org.rocksdb.test.TestableEventListener; +import org.rocksdb.util.Environment; public class EventListenerTest { @ClassRule @@ -264,7 +266,7 @@ public void testAllCallbacksInvocation() { final MemTableInfo memTableInfoTestData = new MemTableInfo( "columnFamilyName", TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL); final FileOperationInfo fileOperationInfoTestData = new FileOperationInfo("/file/path", - TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); + TEST_LONG_VAL, 4096, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); final WriteStallInfo writeStallInfoTestData = new WriteStallInfo("columnFamilyName", (byte) 0x0, (byte) 0x1); final ExternalFileIngestionInfo externalFileIngestionInfoTestData = diff --git a/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/java/src/test/java/org/rocksdb/KeyMayExistTest.java index 3f3bec6bad7..5a9ffd6eb50 100644 --- a/java/src/test/java/org/rocksdb/KeyMayExistTest.java +++ b/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -261,10 +261,12 @@ public void keyMayExistBB() throws RocksDBException { keyBuffer.flip(); assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + keyBuffer.flip(); final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); valueBuffer.position(12); KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, valueBuffer); + keyBuffer.flip(); assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); assertThat(keyMayExist.valueLength).isEqualTo(value.length); assertThat(valueBuffer.position()).isEqualTo(12); @@ -303,10 +305,11 @@ public void keyMayExistBBReadOptions() throws RocksDBException { try (final ReadOptions readOptions = new ReadOptions()) { assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(true); - + keyBuffer.flip(); final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); valueBuffer.position(12); KeyMayExist keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer); + keyBuffer.flip(); assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); assertThat(keyMayExist.valueLength).isEqualTo(value.length); assertThat(valueBuffer.position()).isEqualTo(12); @@ -318,6 +321,7 @@ public void keyMayExistBBReadOptions() throws RocksDBException { valueBuffer.limit(value.length + 24); valueBuffer.position(25); keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer); + keyBuffer.flip(); assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); assertThat(keyMayExist.valueLength).isEqualTo(value.length); assertThat(valueBuffer.position()).isEqualTo(25); @@ -362,7 +366,9 @@ public void keyMayExistBBCF() throws RocksDBException { keyBuffer.flip(); assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(false); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(true); // 1 is just a CF @@ -372,8 +378,11 @@ public void keyMayExistBBCF() throws RocksDBException { keyBuffer.flip(); assertThat(db.keyMayExist(keyBuffer)).isEqualTo(false); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(false); + keyBuffer.flip(); exceptionRule.expect(AssertionError.class); exceptionRule.expectMessage( @@ -395,8 +404,10 @@ public void keyMayExistBBCFReadOptions() throws RocksDBException { try (final ReadOptions readOptions = new ReadOptions()) { assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) .isEqualTo(false); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer)) .isEqualTo(true); @@ -407,8 +418,10 @@ public void keyMayExistBBCFReadOptions() throws RocksDBException { keyBuffer.flip(); assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(false); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) .isEqualTo(true); + keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer)) .isEqualTo(false); @@ -432,10 +445,11 @@ public void keyMayExistBBCFOffset() throws RocksDBException { keyBuffer.flip(); assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true); - + keyBuffer.flip(); final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); valueBuffer.position(12); KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer); + keyBuffer.flip(); assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); assertThat(keyMayExist.valueLength).isEqualTo(value.length); assertThat(valueBuffer.position()).isEqualTo(12); @@ -474,11 +488,12 @@ public void keyMayExistBBCFOffsetReadOptions() throws RocksDBException { try (final ReadOptions readOptions = new ReadOptions()) { assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) .isEqualTo(true); - + keyBuffer.flip(); final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); valueBuffer.position(12); KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer); + keyBuffer.flip(); assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); assertThat(keyMayExist.valueLength).isEqualTo(value.length); assertThat(valueBuffer.position()).isEqualTo(12); @@ -491,6 +506,7 @@ public void keyMayExistBBCFOffsetReadOptions() throws RocksDBException { valueBuffer.position(25); keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer); + keyBuffer.flip(); assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); assertThat(keyMayExist.valueLength).isEqualTo(value.length); assertThat(valueBuffer.position()).isEqualTo(25); diff --git a/java/src/test/java/org/rocksdb/LoggerTest.java b/java/src/test/java/org/rocksdb/LoggerTest.java index b6a7be55e7f..c174be52f02 100644 --- a/java/src/test/java/org/rocksdb/LoggerTest.java +++ b/java/src/test/java/org/rocksdb/LoggerTest.java @@ -232,4 +232,39 @@ protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { } } } + + @Test + public void logLevelLogger() throws RocksDBException { + final AtomicInteger logMessageCounter = new AtomicInteger(); + try (final DBOptions options = new DBOptions().setCreateIfMissing(true); + final Logger logger = new Logger(InfoLogLevel.FATAL_LEVEL) { + // Create new logger with max log level passed by options + @Override + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { + assertThat(logMsg).isNotNull(); + assertThat(logMsg.length()).isGreaterThan(0); + logMessageCounter.incrementAndGet(); + } + }) { + // Set custom logger to options + options.setLogger(logger); + + final List cfDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List cfHandles = new ArrayList<>(); + + try (final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, cfHandles)) { + try { + // there should be zero messages + // using fatal level as log level. + assertThat(logMessageCounter.get()).isEqualTo(0); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : cfHandles) { + columnFamilyHandle.close(); + } + } + } + } + } } diff --git a/java/src/test/java/org/rocksdb/MergeCFVariantsTest.java b/java/src/test/java/org/rocksdb/MergeCFVariantsTest.java new file mode 100644 index 00000000000..6c4f07ddc9a --- /dev/null +++ b/java/src/test/java/org/rocksdb/MergeCFVariantsTest.java @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.MergeTest.longFromByteArray; +import static org.rocksdb.MergeTest.longToByteArray; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class MergeCFVariantsTest { + @FunctionalInterface + interface FunctionMerge { + public void apply(PDatabase db, PColumnFamilyHandle one, PLeft two, PRight three) + throws RocksDBException; + } + + @Parameterized.Parameters + public static List> data() { + return Arrays.asList(RocksDB::merge, + (db, cfh, left, right) + -> db.merge(cfh, new WriteOptions(), left, right), + (db, cfh, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.merge(cfh, left0, 7, left.length, right0, 4, right.length); + }, + (db, cfh, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.merge(cfh, new WriteOptions(), left0, 7, left.length, right0, 4, right.length); + }, + (db, cfh, left, right) + -> { + final ByteBuffer bbLeft = ByteBuffer.allocateDirect(100); + final ByteBuffer bbRight = ByteBuffer.allocateDirect(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.merge(cfh, new WriteOptions(), bbLeft, bbRight); + }, + (db, cfh, left, right) -> { + final ByteBuffer bbLeft = ByteBuffer.allocate(100); + final ByteBuffer bbRight = ByteBuffer.allocate(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.merge(cfh, new WriteOptions(), bbLeft, bbRight); + }); + } + + @Parameterized.Parameter + public FunctionMerge mergeFunction; + + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void cFUInt64AddOperatorOption() throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final ColumnFamilyOptions cfOpt1 = + new ColumnFamilyOptions().setMergeOperator(uint64AddOperator); + final ColumnFamilyOptions cfOpt2 = + new ColumnFamilyOptions().setMergeOperator(uint64AddOperator)) { + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2)); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + opt, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + try { + // writing (long)100 under key + db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(100)); + // merge (long)1 under key + mergeFunction.apply( + db, columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(1)); + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + long longValue = longFromByteArray(value); + + // Test also with createColumnFamily + try (final ColumnFamilyOptions cfHandleOpts = + new ColumnFamilyOptions().setMergeOperator(uint64AddOperator); + final ColumnFamilyHandle cfHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), cfHandleOpts))) { + // writing (long)200 under cfkey2 + db.put(cfHandle, "cfkey2".getBytes(), longToByteArray(200)); + // merge (long)50 under cfkey2 + db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), longToByteArray(50)); + value = db.get(cfHandle, "cfkey2".getBytes()); + long longValueTmpCf = longFromByteArray(value); + + assertThat(longValue).isEqualTo(101); + assertThat(longValueTmpCf).isEqualTo(250); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/MergeTest.java b/java/src/test/java/org/rocksdb/MergeTest.java index f99ac49d3dd..10ffeb7788b 100644 --- a/java/src/test/java/org/rocksdb/MergeTest.java +++ b/java/src/test/java/org/rocksdb/MergeTest.java @@ -45,14 +45,14 @@ public void stringOption() } } - private byte[] longToByteArray(final long l) { + static byte[] longToByteArray(final long l) { final ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.putLong(l); return buf.array(); } - private long longFromByteArray(final byte[] a) { + static long longFromByteArray(final byte[] a) { final ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.put(a); diff --git a/java/src/test/java/org/rocksdb/MergeVariantsTest.java b/java/src/test/java/org/rocksdb/MergeVariantsTest.java new file mode 100644 index 00000000000..1acedc1e694 --- /dev/null +++ b/java/src/test/java/org/rocksdb/MergeVariantsTest.java @@ -0,0 +1,95 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.MergeTest.longFromByteArray; +import static org.rocksdb.MergeTest.longToByteArray; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class MergeVariantsTest { + @FunctionalInterface + interface FunctionMerge { + public void apply(PDatabase db, PLeft two, PRight three) throws RocksDBException; + } + + @Parameterized.Parameters + public static List> data() { + return Arrays.asList(RocksDB::merge, + (db, left, right) + -> db.merge(new WriteOptions(), left, right), + (db, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.merge(left0, 7, left.length, right0, 4, right.length); + }, + (db, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.merge(new WriteOptions(), left0, 7, left.length, right0, 4, right.length); + }, + (db, left, right) + -> { + final ByteBuffer bbLeft = ByteBuffer.allocateDirect(100); + final ByteBuffer bbRight = ByteBuffer.allocateDirect(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.merge(new WriteOptions(), bbLeft, bbRight); + }, + (db, left, right) -> { + final ByteBuffer bbLeft = ByteBuffer.allocate(100); + final ByteBuffer bbRight = ByteBuffer.allocate(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.merge(new WriteOptions(), bbLeft, bbRight); + }); + } + + @Parameterized.Parameter + public MergeVariantsTest.FunctionMerge mergeFunction; + + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void uint64AddOperatorOption() throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final Options opt = + new Options().setCreateIfMissing(true).setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + // Writing (long)100 under key + db.put("key".getBytes(), longToByteArray(100)); + + // Writing (long)1 under key + mergeFunction.apply(db, "key".getBytes(), longToByteArray(1)); + + final byte[] value = db.get("key".getBytes()); + final long longValue = longFromByteArray(value); + + assertThat(longValue).isEqualTo(101); + } + } +} diff --git a/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java b/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java index e66eef6229a..03daec0c4b9 100644 --- a/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java +++ b/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java @@ -7,11 +7,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.util.*; +import org.junit.Assume; +import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.rocksdb.util.Environment; @RunWith(Parameterized.class) public class MultiGetManyKeysTest { @@ -28,6 +31,12 @@ public MultiGetManyKeysTest(final Integer numKeys) { this.numKeys = numKeys; } + @BeforeClass + public static void beforeAllTest() { + Assume.assumeFalse("We are not running this test on 32bit systems dues to memory constraints", + !Environment.is64Bit()); + } + /** * Test for multiGet problem */ diff --git a/java/src/test/java/org/rocksdb/MultiGetTest.java b/java/src/test/java/org/rocksdb/MultiGetTest.java index c391d81f631..74b13469844 100644 --- a/java/src/test/java/org/rocksdb/MultiGetTest.java +++ b/java/src/test/java/org/rocksdb/MultiGetTest.java @@ -4,14 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). package org.rocksdb; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.fail; +import static org.assertj.core.api.Assertions.*; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -24,8 +24,13 @@ public class MultiGetTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); - @Test - public void putNThenMultiGet() throws RocksDBException { + @FunctionalInterface + public interface RocksDBBiFunction { + R apply(T1 t1, T2 t2) throws RocksDBException; + } + + private void putNThenMultiGetHelper( + RocksDBBiFunction, List> multiGetter) throws RocksDBException { try (final Options opt = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { db.put("key1".getBytes(), "value1ForKey1".getBytes()); @@ -33,7 +38,7 @@ public void putNThenMultiGet() throws RocksDBException { db.put("key3".getBytes(), "value3ForKey3".getBytes()); final List keys = Arrays.asList("key1".getBytes(), "key2".getBytes(), "key3".getBytes()); - final List values = db.multiGetAsList(keys); + final List values = multiGetter.apply(db, keys); assertThat(values.size()).isEqualTo(keys.size()); assertThat(values.get(0)).isEqualTo("value1ForKey1".getBytes()); assertThat(values.get(1)).isEqualTo("value2ForKey2".getBytes()); @@ -41,6 +46,42 @@ public void putNThenMultiGet() throws RocksDBException { } } + private void putNThenMultiGetHelperWithMissing( + RocksDBBiFunction, List> multiGetter) throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + final List keys = + Arrays.asList("key1".getBytes(), "key2".getBytes(), "key3".getBytes()); + final List values = multiGetter.apply(db, keys); + assertThat(values.size()).isEqualTo(keys.size()); + assertThat(values.get(0)).isEqualTo("value1ForKey1".getBytes()); + assertThat(values.get(1)).isEqualTo(null); + assertThat(values.get(2)).isEqualTo("value3ForKey3".getBytes()); + } + } + + @Test + public void putNThenMultiGet() throws RocksDBException { + putNThenMultiGetHelper(RocksDB::multiGetAsList); + } + + @Test + public void putNThenMultiGetWithMissing() throws RocksDBException { + putNThenMultiGetHelperWithMissing(RocksDB::multiGetAsList); + } + + @Test + public void putNThenMultiGetReadOptions() throws RocksDBException { + putNThenMultiGetHelper((db, keys) -> db.multiGetAsList(new ReadOptions(), keys)); + } + + @Test + public void putNThenMultiGetReadOptionsWithMissing() throws RocksDBException { + putNThenMultiGetHelperWithMissing((db, keys) -> db.multiGetAsList(new ReadOptions(), keys)); + } + @Test public void putNThenMultiGetDirect() throws RocksDBException { try (final Options opt = new Options().setCreateIfMissing(true); @@ -103,6 +144,65 @@ public void putNThenMultiGetDirect() throws RocksDBException { } } + @Test + public void putNThenMultiGetDirectWithMissing() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo(0); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(results.get(1).value).isNull(); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List results = + db.multiGetByteBuffers(new ReadOptions(), keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo(0); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(results.get(1).value).isNull(); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + @Test public void putNThenMultiGetDirectSliced() throws RocksDBException { try (final Options opt = new Options().setCreateIfMissing(true); @@ -146,6 +246,47 @@ public void putNThenMultiGetDirectSliced() throws RocksDBException { } } + @Test + public void putNThenMultiGetDirectSlicedWithMissing() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + keys.add( + ByteBuffer.allocateDirect(12).put("prefix1".getBytes()).slice().put("key1".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(1).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(0).requiredSize).isEqualTo(0); + + assertThat(results.get(0).value).isNull(); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value3ForKey3".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value1ForKey1".getBytes()); + } + } + } + @Test public void putNThenMultiGetDirectBadValuesArray() throws RocksDBException { try (final Options opt = new Options().setCreateIfMissing(true); @@ -315,6 +456,39 @@ public void putNThenMultiGetDirectNondefaultCF() throws RocksDBException { assertThat(TestUtil.bufferBytes(results.get(2).value)) .isEqualTo("value3ForKey3".getBytes()); } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + + final List keysWithMissing = new ArrayList<>(); + keysWithMissing.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keysWithMissing.add(ByteBuffer.allocateDirect(12).put("key3Bad".getBytes())); + keysWithMissing.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keysWithMissing) { + key.flip(); + } + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keysWithMissing, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo(0); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(results.get(1).value).isNull(); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } } } @@ -527,4 +701,321 @@ public void putNThenMultiGetDirectTruncateCF() throws RocksDBException { } } } + + /** + * + * @param db database to write to + * @param key key to write + * @return expected size of data written + * @throws RocksDBException if {@code put} or {@code merge} fail + */ + private long createIntOverflowValue( + final RocksDB db, final ColumnFamilyHandle cf, final String key) throws RocksDBException { + final int BUFSIZE = 100000000; + final int BUFCOUNT = 30; + final byte[] wbuf = new byte[BUFSIZE]; + Arrays.fill(wbuf, (byte) 10); + for (int i = 0; i < BUFCOUNT; i++) { + final byte[] vals = ("value" + i + "ForKey" + key).getBytes(); + System.arraycopy(vals, 0, wbuf, 0, vals.length); + db.merge(cf, "key1".getBytes(), wbuf); + } + return ((long) BUFSIZE + 1) * BUFCOUNT - 1; + } + + private void checkIntOVerflowValue(final ByteBuffer byteBuffer, final String key) { + final int BUFSIZE = 100000000; + final int BUFCOUNT = 30; + for (int i = 0; i < BUFCOUNT; i++) { + final byte[] vals = ("value" + i + "ForKey" + key).getBytes(); + final long position = (long) i * (BUFSIZE + 1); + if (position > Integer.MAX_VALUE) + break; + byteBuffer.position((int) position); + for (byte b : vals) { + assertThat(byteBuffer.get()).isEqualTo(b); + } + } + } + + private static ByteBuffer bbDirect(final String s) { + final byte[] bytes = s.getBytes(); + final ByteBuffer byteBuffer = ByteBuffer.allocateDirect(bytes.length); + byteBuffer.put(bytes); + byteBuffer.flip(); + + return byteBuffer; + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetDirect() throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final long length = createIntOverflowValue(db, db.getDefaultColumnFamily(), "key1"); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + + final List byteBufferValues = new ArrayList<>(); + byteBufferValues.add(ByteBuffer.allocateDirect(Integer.MAX_VALUE)); + final List byteBufferKeys = new ArrayList<>(); + byteBufferKeys.add(bbDirect("key1")); + + final List statusList = + db.multiGetByteBuffers(new ReadOptions(), byteBufferKeys, byteBufferValues); + + assertThat(statusList.size()).isEqualTo(1); + final ByteBufferGetStatus status = statusList.get(0); + assertThat(status.status.getCode()).isEqualTo(Status.Code.Incomplete); + + checkIntOVerflowValue(status.value, "key1"); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetDirectCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final ColumnFamilyOptions cfOptions = + new ColumnFamilyOptions().setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(0); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes(), cfOptions)); + final List cf = db.createColumnFamilies(cfDescriptors); + + final long length = createIntOverflowValue(db, cf.get(0), "key1"); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes()); + + final List byteBufferValues = new ArrayList<>(); + byteBufferValues.add(ByteBuffer.allocateDirect(Integer.MAX_VALUE)); + final List byteBufferKeys = new ArrayList<>(); + byteBufferKeys.add(bbDirect("key1")); + + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + + final List statusList = db.multiGetByteBuffers( + new ReadOptions(), columnFamilyHandles, byteBufferKeys, byteBufferValues); + + assertThat(statusList.size()).isEqualTo(1); + final ByteBufferGetStatus status = statusList.get(0); + assertThat(status.status.getCode()).isEqualTo(Status.Code.Incomplete); + + checkIntOVerflowValue(status.value, "key1"); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetDirect2Keys() throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final long length = createIntOverflowValue(db, db.getDefaultColumnFamily(), "key1"); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + + final List byteBufferValues = new ArrayList<>(); + byteBufferValues.add(ByteBuffer.allocateDirect(Integer.MAX_VALUE)); + byteBufferValues.add(ByteBuffer.allocateDirect(12)); + final List byteBufferKeys = new ArrayList<>(); + byteBufferKeys.add(bbDirect("key1")); + byteBufferKeys.add(bbDirect("key2")); + + final List statusList = + db.multiGetByteBuffers(new ReadOptions(), byteBufferKeys, byteBufferValues); + + assertThat(statusList.size()).isEqualTo(2); + assertThat(statusList.get(0).status.getCode()).isEqualTo(Status.Code.Incomplete); + checkIntOVerflowValue(statusList.get(0).value, "key1"); + + assertThat(statusList.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + final ByteBuffer bbKey2 = statusList.get(1).value; + final byte[] bytes = new byte[bbKey2.capacity()]; + bbKey2.get(bytes); + assertThat(bytes).isEqualTo("value2ForKey".getBytes()); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetDirect2KeysCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final ColumnFamilyOptions cfOptions = + new ColumnFamilyOptions().setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(0); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes(), cfOptions)); + final List cf = db.createColumnFamilies(cfDescriptors); + + final long length = createIntOverflowValue(db, cf.get(0), "key1"); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes()); + + final List byteBufferValues = new ArrayList<>(); + byteBufferValues.add(ByteBuffer.allocateDirect(Integer.MAX_VALUE)); + byteBufferValues.add(ByteBuffer.allocateDirect(12)); + final List byteBufferKeys = new ArrayList<>(); + byteBufferKeys.add(bbDirect("key1")); + byteBufferKeys.add(bbDirect("key2")); + + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + + final List statusList = db.multiGetByteBuffers( + new ReadOptions(), columnFamilyHandles, byteBufferKeys, byteBufferValues); + + assertThat(statusList.size()).isEqualTo(2); + assertThat(statusList.get(0).status.getCode()).isEqualTo(Status.Code.Incomplete); + checkIntOVerflowValue(statusList.get(0).value, "key1"); + + assertThat(statusList.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + final ByteBuffer bbKey2 = statusList.get(1).value; + final byte[] bytes = new byte[bbKey2.capacity()]; + bbKey2.get(bytes); + assertThat(bytes).isEqualTo("value2ForKey".getBytes()); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetAsList() throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final long length = createIntOverflowValue(db, db.getDefaultColumnFamily(), "key1"); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + + final List keys = new ArrayList<>(); + keys.add("key1".getBytes()); + assertThatThrownBy(() -> { db.multiGetAsList(keys); }) + .isInstanceOf(RocksDBException.class) + .hasMessageContaining("Requested array size exceeds VM limit"); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetAsListCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final ColumnFamilyOptions cfOptions = + new ColumnFamilyOptions().setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(0); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes(), cfOptions)); + final List cf = db.createColumnFamilies(cfDescriptors); + + final long length = createIntOverflowValue(db, cf.get(0), "key1"); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes()); + + final List keys = new ArrayList<>(); + keys.add("key1".getBytes()); + assertThatThrownBy(() -> { db.multiGetAsList(cf, keys); }) + .isInstanceOf(RocksDBException.class) + .hasMessageContaining("Requested array size exceeds VM limit"); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetAsList2Keys() throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final long length = createIntOverflowValue(db, db.getDefaultColumnFamily(), "key1"); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + + final List keys = new ArrayList<>(); + keys.add("key2".getBytes()); + keys.add("key1".getBytes()); + assertThatThrownBy(() -> { db.multiGetAsList(keys); }) + .isInstanceOf(RocksDBException.class) + .hasMessageContaining("Requested array size exceeds VM limit"); + } + } + + /** + * Too slow/disk space dependent for CI + * @throws RocksDBException + */ + @Ignore + @Test + public void putBigMultiGetAsList2KeysCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final ColumnFamilyOptions cfOptions = + new ColumnFamilyOptions().setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(0); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes(), cfOptions)); + final List cf = db.createColumnFamilies(cfDescriptors); + + final long length = createIntOverflowValue(db, cf.get(0), "key1"); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes()); + + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + + final List keys = new ArrayList<>(); + keys.add("key2".getBytes()); + keys.add("key1".getBytes()); + assertThatThrownBy(() -> { db.multiGetAsList(columnFamilyHandles, keys); }) + .isInstanceOf(RocksDBException.class) + .hasMessageContaining("Requested array size exceeds VM limit"); + } + } + + /** + * This eventually doesn't throw as expected + * At about 3rd loop of asking (on a 64GB M1 Max Mac) + * I presume it's a legitimate space exhaustion error in RocksDB, + * but I think it worth having this here as a record. + * + * @throws RocksDBException + */ + @Test + @Ignore + public void putBigMultiGetAsListRepeat() throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setMergeOperatorName("stringappend"); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final long length = createIntOverflowValue(db, db.getDefaultColumnFamily(), "key1"); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + + final int REPEAT = 10; + for (int i = 0; i < REPEAT; i++) { + final List keys = new ArrayList<>(); + keys.add("key1".getBytes()); + assertThatThrownBy(() -> { db.multiGetAsList(keys); }) + .isInstanceOf(RocksDBException.class) + .hasMessageContaining("Requested array size exceeds VM limit"); + } + } + } } diff --git a/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java b/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java index 519b70b1d2f..237fcaf0c8b 100644 --- a/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java +++ b/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java @@ -5,15 +5,15 @@ package org.rocksdb; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import static org.assertj.core.api.Assertions.assertThat; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; - -import static org.assertj.core.api.Assertions.assertThat; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; public class OptimisticTransactionDBTest { @@ -56,6 +56,21 @@ public void open_columnFamilies() throws RocksDBException { } } + @Test(expected = IllegalArgumentException.class) + public void open_columnFamilies_no_default() throws RocksDBException { + try (final DBOptions dbOptions = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + final List columnFamilyDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List columnFamilyHandles = new ArrayList<>(); + + OptimisticTransactionDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles); + } + } + @Test public void beginTransaction() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); @@ -128,4 +143,21 @@ public void baseDB() throws RocksDBException { assertThat(db.isOwningHandle()).isFalse(); } } + + @Test + public void otdbSimpleIterator() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true).setMaxCompactionBytes(0); + final OptimisticTransactionDB otdb = + OptimisticTransactionDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + otdb.put("keyI".getBytes(), "valueI".getBytes()); + try (final RocksIterator iterator = otdb.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("keyI".getBytes()); + assertThat(iterator.value()).isEqualTo("valueI".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + } + } + } } diff --git a/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java b/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java index d2f92e1ff26..4959d207bc3 100644 --- a/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java +++ b/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java @@ -373,12 +373,13 @@ public OptimisticTransactionDBContainer startDb() .setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions defaultColumnFamilyOptions = new ColumnFamilyOptions(); + defaultColumnFamilyOptions.setMergeOperator(new StringAppendOperator("++")); final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); - final List columnFamilyDescriptors = - Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, - columnFamilyOptions)); + columnFamilyOptions.setMergeOperator(new StringAppendOperator("**")); + final List columnFamilyDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, defaultColumnFamilyOptions), + new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, columnFamilyOptions)); final List columnFamilyHandles = new ArrayList<>(); final OptimisticTransactionDB optimisticTxnDb; diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 4b59464b1e3..9d23167370c 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -699,16 +699,6 @@ public void setWriteBufferManagerWithAllowStall() throws RocksDBException { } } - @SuppressWarnings("deprecated") - @Test - public void accessHintOnCompactionStart() { - try (final Options opt = new Options()) { - final AccessHint accessHint = AccessHint.SEQUENTIAL; - opt.setAccessHintOnCompactionStart(accessHint); - assertThat(opt.accessHintOnCompactionStart()).isEqualTo(accessHint); - } - } - @Test public void compactionReadaheadSize() { try (final Options opt = new Options()) { @@ -1496,4 +1486,20 @@ public void onMemTableSealed(final MemTableInfo memTableInfo) { assertEquals(0, listeners2.size()); } } + @Test + public void tablePropertiesCollectorFactory() { + try (final Options options = new Options()) { + try (TablePropertiesCollectorFactory collectorFactory = + TablePropertiesCollectorFactory.NewCompactOnDeletionCollectorFactory(10, 10, 1.0)) { + List factories = Arrays.asList(collectorFactory); + options.setTablePropertiesCollectorFactory(factories); + } + List factories = options.tablePropertiesCollectorFactory(); + try { + assertThat(factories).hasSize(1); + } finally { + factories.stream().forEach(TablePropertiesCollectorFactory::close); + } + } + } } diff --git a/java/src/test/java/org/rocksdb/OptionsUtilTest.java b/java/src/test/java/org/rocksdb/OptionsUtilTest.java index 23949ac0662..e14fb58fa69 100644 --- a/java/src/test/java/org/rocksdb/OptionsUtilTest.java +++ b/java/src/test/java/org/rocksdb/OptionsUtilTest.java @@ -358,6 +358,7 @@ private void verifyBlockBasedTableConfig( assertThat(actual.metadataBlockSize()).isEqualTo(expected.metadataBlockSize()); assertThat(actual.partitionFilters()).isEqualTo(expected.partitionFilters()); assertThat(actual.optimizeFiltersForMemory()).isEqualTo(expected.optimizeFiltersForMemory()); + assertThat(actual.useDeltaEncoding()).isEqualTo(expected.useDeltaEncoding()); assertThat(actual.wholeKeyFiltering()).isEqualTo(expected.wholeKeyFiltering()); assertThat(actual.verifyCompression()).isEqualTo(expected.verifyCompression()); assertThat(actual.readAmpBytesPerBit()).isEqualTo(expected.readAmpBytesPerBit()); @@ -370,9 +371,5 @@ private void verifyBlockBasedTableConfig( } else { assertThat(expected.filterPolicy().equals(actual.filterPolicy())); } - - // not currently persisted - always true when read from options - // this test will fail, and need repaired, if and when "useDeltaEncoding" is persisted. - assertThat(actual.useDeltaEncoding()).isEqualTo(true); } } diff --git a/java/src/test/java/org/rocksdb/PerfContextTest.java b/java/src/test/java/org/rocksdb/PerfContextTest.java index 3145b59e439..5be1c47e928 100644 --- a/java/src/test/java/org/rocksdb/PerfContextTest.java +++ b/java/src/test/java/org/rocksdb/PerfContextTest.java @@ -17,6 +17,7 @@ import java.util.List; import org.junit.*; import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.Environment; public class PerfContextTest { @ClassRule @@ -80,6 +81,8 @@ public void testAllGetters() throws RocksDBException, IntrospectionException, @Test public void testGetBlockReadCpuTime() throws RocksDBException { + Assume.assumeFalse("We are not running this test on OpenBSD or Windows", + Environment.isOpenBSD() || Environment.isWindows()); db.setPerfLevel(PerfLevel.ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX); db.put("key".getBytes(), "value".getBytes()); db.compactRange(); diff --git a/java/src/test/java/org/rocksdb/PutCFVariantsTest.java b/java/src/test/java/org/rocksdb/PutCFVariantsTest.java new file mode 100644 index 00000000000..977c74dc8ea --- /dev/null +++ b/java/src/test/java/org/rocksdb/PutCFVariantsTest.java @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.MergeTest.longFromByteArray; +import static org.rocksdb.MergeTest.longToByteArray; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class PutCFVariantsTest { + @FunctionalInterface + interface FunctionCFPut { + public void apply(PDatabase db, PColumnFamilyHandle cfh, PLeft two, PRight three) + throws RocksDBException; + } + + @Parameterized.Parameters + public static List> + data() { + return Arrays.asList(RocksDB::put, + (db, cfh, left, right) + -> db.put(cfh, new WriteOptions(), left, right), + (db, cfh, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.put(cfh, left0, 7, left.length, right0, 4, right.length); + }, + (db, cfh, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.put(cfh, new WriteOptions(), left0, 7, left.length, right0, 4, right.length); + }, + + (db, cfh, left, right) + -> { + final ByteBuffer bbLeft = ByteBuffer.allocateDirect(100); + final ByteBuffer bbRight = ByteBuffer.allocateDirect(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.put(cfh, new WriteOptions(), bbLeft, bbRight); + }, + (db, cfh, left, right) -> { + final ByteBuffer bbLeft = ByteBuffer.allocate(100); + final ByteBuffer bbRight = ByteBuffer.allocate(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.put(cfh, new WriteOptions(), bbLeft, bbRight); + }); + } + + @Parameterized.Parameter + public PutCFVariantsTest.FunctionCFPut putFunction; + + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void writeAndRead() throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final ColumnFamilyOptions cfOpt1 = + new ColumnFamilyOptions().setMergeOperator(uint64AddOperator); + final ColumnFamilyOptions cfOpt2 = + new ColumnFamilyOptions().setMergeOperator(uint64AddOperator)) { + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2)); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + opt, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + try { + // writing (long)100 under key + putFunction.apply( + db, columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(100)); + // merge (long)1 under key + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + final long longValue = longFromByteArray(value); + + // Test also with createColumnFamily + try (final ColumnFamilyOptions cfHandleOpts = + new ColumnFamilyOptions().setMergeOperator(uint64AddOperator); + final ColumnFamilyHandle cfHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), cfHandleOpts))) { + // writing (long)200 under cfkey2 + db.put(cfHandle, "cfkey2".getBytes(), longToByteArray(200)); + // merge (long)50 under cfkey2 + value = db.get(cfHandle, "cfkey2".getBytes()); + final long longValueTmpCf = longFromByteArray(value); + + assertThat(longValue).isEqualTo(100); + assertThat(longValueTmpCf).isEqualTo(200); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/PutVariantsTest.java b/java/src/test/java/org/rocksdb/PutVariantsTest.java new file mode 100644 index 00000000000..2e0e9b9e364 --- /dev/null +++ b/java/src/test/java/org/rocksdb/PutVariantsTest.java @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.MergeTest.longFromByteArray; +import static org.rocksdb.MergeTest.longToByteArray; + +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class PutVariantsTest { + @FunctionalInterface + interface FunctionPut { + public void apply(PDatabase db, PLeft two, PRight three) throws RocksDBException; + } + + @Parameterized.Parameters + public static List> data() { + return Arrays.asList(RocksDB::put, + (db, left, right) + -> db.put(new WriteOptions(), left, right), + (db, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.put(left0, 7, left.length, right0, 4, right.length); + }, + (db, left, right) + -> { + final byte[] left0 = + ("1234567" + new String(left, StandardCharsets.UTF_8) + "890").getBytes(); + final byte[] right0 = + ("1234" + new String(right, StandardCharsets.UTF_8) + "567890ab").getBytes(); + db.put(new WriteOptions(), left0, 7, left.length, right0, 4, right.length); + }, + (db, left, right) + -> { + final ByteBuffer bbLeft = ByteBuffer.allocateDirect(100); + final ByteBuffer bbRight = ByteBuffer.allocateDirect(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.put(new WriteOptions(), bbLeft, bbRight); + }, + (db, left, right) -> { + final ByteBuffer bbLeft = ByteBuffer.allocate(100); + final ByteBuffer bbRight = ByteBuffer.allocate(100); + bbLeft.put(left).flip(); + bbRight.put(right).flip(); + db.put(new WriteOptions(), bbLeft, bbRight); + }); + } + + @Parameterized.Parameter public PutVariantsTest.FunctionPut putFunction; + + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void writeAndRead() throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final Options opt = + new Options().setCreateIfMissing(true).setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + // Writing (long)100 under key + putFunction.apply(db, "key".getBytes(), longToByteArray(100)); + + final byte[] value = db.get("key".getBytes()); + final long longValue = longFromByteArray(value); + + assertThat(longValue).isEqualTo(100); + } + } +} diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java index 1bc24b98449..baf51bf9b4b 100644 --- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -255,6 +255,14 @@ public void valueSizeSoftLimit() { } } + @Test + public void asyncIo() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setAsyncIo(true); + assertThat(opt.asyncIo()).isTrue(); + } + } + @Test public void failSetVerifyChecksumUninitialized() { try (final ReadOptions readOptions = diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index ed6e989a8d4..1459f03b05a 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -184,10 +184,8 @@ public void put() throws RocksDBException { final WriteOptions opt = new WriteOptions(); final ReadOptions optr = new ReadOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put(opt, "key2".getBytes(), "12345678".getBytes()); - assertThat(db.get("key1".getBytes())).isEqualTo( - "value".getBytes()); - assertThat(db.get("key2".getBytes())).isEqualTo( - "12345678".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes()); final ByteBuffer key = ByteBuffer.allocateDirect(12); final ByteBuffer value = ByteBuffer.allocateDirect(12); @@ -221,15 +219,31 @@ public void put() throws RocksDBException { key.position(4); + final ByteBuffer result2 = ByteBuffer.allocateDirect(12); + result2.put("abcdefghijkl".getBytes()); + result2.flip().position(3); + assertThat(db.get(optr, key, result2)).isEqualTo(4); + assertThat(result2.position()).isEqualTo(3); + assertThat(result2.limit()).isEqualTo(7); + assertThat(key.position()).isEqualTo(8); + assertThat(key.limit()).isEqualTo(8); + + final byte[] tmp2 = new byte[12]; + result2.position(0).limit(12); + result2.get(tmp2); + assertThat(tmp2).isEqualTo("abcval3hijkl".getBytes()); + + key.position(4); + result.clear().position(9); assertThat(db.get(optr, key, result)).isEqualTo(4); assertThat(result.position()).isEqualTo(9); assertThat(result.limit()).isEqualTo(12); assertThat(key.position()).isEqualTo(8); assertThat(key.limit()).isEqualTo(8); - final byte[] tmp2 = new byte[3]; - result.get(tmp2); - assertThat(tmp2).isEqualTo("val".getBytes()); + final byte[] tmp3 = new byte[3]; + result.get(tmp3); + assertThat(tmp3).isEqualTo("val".getBytes()); // put final Segment key3 = sliceSegment("key3"); @@ -245,8 +259,73 @@ public void put() throws RocksDBException { } } - private static Segment sliceSegment(final String key) { - final ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4); + @Test + public void putIndirectByteBuffers() throws RocksDBException { + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final WriteOptions opt = new WriteOptions(); final ReadOptions optr = new ReadOptions()) { + db.put("key1".getBytes(), "value".getBytes()); + db.put(opt, "key2".getBytes(), "12345678".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes()); + + ByteBuffer key = ByteBuffer.allocate(12); + ByteBuffer value = ByteBuffer.allocate(12); + key.position(4); + key.put("key3".getBytes()); + key.position(4).limit(8); + value.position(4); + value.put("val3".getBytes()); + value.position(4).limit(8); + + db.put(opt, key, value); + + assertThat(key.position()).isEqualTo(8); + assertThat(key.limit()).isEqualTo(8); + + assertThat(value.position()).isEqualTo(8); + assertThat(value.limit()).isEqualTo(8); + + key.position(4); + + ByteBuffer result = ByteBuffer.allocate(12); + assertThat(db.get(optr, key, result)).isEqualTo(4); + assertThat(result.position()).isEqualTo(0); + assertThat(result.limit()).isEqualTo(4); + assertThat(key.position()).isEqualTo(8); + assertThat(key.limit()).isEqualTo(8); + + byte[] tmp = new byte[4]; + result.get(tmp); + assertThat(tmp).isEqualTo("val3".getBytes()); + + key.position(4); + + result.clear().position(9); + assertThat(db.get(optr, key, result)).isEqualTo(4); + assertThat(result.position()).isEqualTo(9); + assertThat(result.limit()).isEqualTo(12); + assertThat(key.position()).isEqualTo(8); + assertThat(key.limit()).isEqualTo(8); + byte[] tmp2 = new byte[3]; + result.get(tmp2); + assertThat(tmp2).isEqualTo("val".getBytes()); + + // put + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + Segment value0 = sliceSegment("value 0"); + Segment value1 = sliceSegment("value 1"); + db.put(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); + db.put(opt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); + + // compare + Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len))); + Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len))); + } + } + + private static Segment sliceSegment(String key) { + ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4); rawKey.put((byte)0); rawKey.put((byte)0); rawKey.put(key.getBytes()); @@ -1073,6 +1152,40 @@ public void compactRangeToLevelColumnFamily() } } + @Test + public void compactRangeWithNullBoundaries() throws RocksDBException { + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setDisableAutoCompactions(true) + .setCompactionStyle(CompactionStyle.LEVEL) + .setNumLevels(4) + .setWriteBufferSize(100 << 10) + .setLevelZeroFileNumCompactionTrigger(3) + .setTargetFileSizeBase(200 << 10) + .setTargetFileSizeMultiplier(1) + .setMaxBytesForLevelBase(500 << 10) + .setMaxBytesForLevelMultiplier(1) + .setDisableAutoCompactions(true); + final FlushOptions flushOptions = new FlushOptions(); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final byte[] b = new byte[10000]; + // Create an SST containing key4, key5, and key6 + db.put(("key4").getBytes(), b); + db.put(("key5").getBytes(), b); + db.put(("key6").getBytes(), b); + db.flush(flushOptions); + // Create a new SST that includes the tombstones of all keys + db.delete(("key4").getBytes()); + db.delete(("key5").getBytes()); + db.delete(("key6").getBytes()); + db.flush(flushOptions); + + db.compactRange(("key4").getBytes(), null); + List liveFilesMetaData = db.getLiveFilesMetaData(); + assertThat(liveFilesMetaData.size()).isEqualTo(0); + } + } + @Test public void continueBackgroundWorkAfterCancelAllBackgroundWork() throws RocksDBException { final int KEY_SIZE = 20; @@ -1147,9 +1260,7 @@ public void enableDisableFileDeletions() throws RocksDBException { dbFolder.getRoot().getAbsolutePath()) ) { db.disableFileDeletions(); - db.enableFileDeletions(false); - db.disableFileDeletions(); - db.enableFileDeletions(true); + db.enableFileDeletions(); } } @@ -1284,6 +1395,61 @@ public void getApproximateMemTableStatsSingleKey() throws RocksDBException { } } + @Test + public void getLiveFilesMetadataWithChecksum() throws RocksDBException { + final Properties props = new Properties(); + final byte[] key1 = "key1".getBytes(UTF_8); + props.put("file_checksum_gen_factory", "FileChecksumGenCrc32cFactory"); + + try (final DBOptions dbOptions = DBOptions.getDBOptionsFromProps(props); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(); + final Options options = new Options(dbOptions, cfOptions).setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + + // disable WAL so we have a deterministic checksum + try (final RocksDB db = RocksDB.open(options, dbPath); + final WriteOptions writeOptions = new WriteOptions().setDisableWAL(true)) { + db.put(writeOptions, key1, key1); + } + + try (final RocksDB db = RocksDB.open(options, dbPath)) { + final List expectedFileMetadata = db.getLiveFilesMetaData(); + assertThat(expectedFileMetadata).hasSize(1); + // ideally we could re-compute here, but CRC32C is a Java 9 feature, so we have no CRC32C + // implementation available here + final LiveFileMetaData sstFile = expectedFileMetadata.get(0); + assertThat(sstFile.fileChecksum()).isNotEmpty(); + } + } + } + + @Test + public void getColumnFamilyMetadataWithChecksum() throws RocksDBException { + final Properties props = new Properties(); + props.put("file_checksum_gen_factory", "FileChecksumGenCrc32cFactory"); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + + try (final DBOptions dbOptions = DBOptions.getDBOptionsFromProps(props); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(); + final Options options = new Options(dbOptions, cfOptions).setCreateIfMissing(true)) { + try (final RocksDB db = RocksDB.open(options, dbPath); + final WriteOptions writeOptions = new WriteOptions().setDisableWAL(true)) { + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + } + + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + ColumnFamilyMetaData metadata = db.getColumnFamilyMetaData(); // Exception here + List levels = metadata.levels(); + assertThat(levels).isNotEmpty(); + List filesMetadata = levels.get(0).files(); + assertThat(filesMetadata).isNotEmpty(); + assertThat(filesMetadata.get(0).fileChecksum()).isNotNull(); + assertThat(filesMetadata.get(0).fileChecksum()).hasSize(4); + assertThat(filesMetadata.get(0).fileChecksum()).isNotEqualTo(new byte[] {0, 0, 0, 0}); + } + } + } + @Ignore("TODO(AR) re-enable when ready!") @Test public void compactFiles() throws RocksDBException { @@ -1738,6 +1904,14 @@ public void rocksdbVersion() { assertThat(version.getMajor()).isGreaterThan(1); } + @Test + public void isClosed() throws RocksDBException { + final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + assertThat(db.isClosed()).isFalse(); + db.close(); + assertThat(db.isClosed()).isTrue(); + } + private static class InMemoryTraceWriter extends AbstractTraceWriter { private final List writes = new ArrayList<>(); private volatile boolean closed = false; diff --git a/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/java/src/test/java/org/rocksdb/RocksIteratorTest.java index 2a13550b735..bbbb9e2e5e7 100644 --- a/java/src/test/java/org/rocksdb/RocksIteratorTest.java +++ b/java/src/test/java/org/rocksdb/RocksIteratorTest.java @@ -5,6 +5,7 @@ package org.rocksdb; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.fail; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; @@ -45,7 +46,7 @@ private void validateValue( } @Test - public void rocksIterator() throws RocksDBException { + public void rocksIteratorByteBuffers() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { @@ -72,6 +73,103 @@ public void rocksIterator() throws RocksDBException { validateKey(iterator, ByteBuffer.allocate(5), "key1"); validateValue(iterator, ByteBuffer.allocate(2), "value1"); validateValue(iterator, ByteBuffer.allocate(8), "value1"); + } + } + } + + @Test + public void rocksIteratorByteArrayValues() throws RocksDBException { + try (final Options options = + new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + + final byte[] valueArray0 = new byte[2]; + assertThat(iterator.value(valueArray0)).isEqualTo(6); + assertThat(valueArray0).isEqualTo("va".getBytes()); + final byte[] valueArray1 = new byte[8]; + assertThat(iterator.value(valueArray1)).isEqualTo(6); + assertThat(valueArray1).isEqualTo("value1\0\0".getBytes()); + final byte[] valueArray2 = new byte[10]; + assertThat(iterator.value(valueArray2, 2, 6)).isEqualTo(6); + assertThat(valueArray2).isEqualTo("\0\0value1\0\0".getBytes()); + final byte[] valueArray3 = new byte[10]; + assertThat(iterator.value(valueArray3, 5, 5)).isEqualTo(6); + assertThat(valueArray3).isEqualTo("\0\0\0\0\0value".getBytes()); + final byte[] valueArray4 = new byte[6]; + try { + iterator.value(valueArray4, 1, 6); + fail("Expected IndexOutOfBoundsException"); + } catch (final IndexOutOfBoundsException ignored) { + // we should arrive here + } + final byte[] valueArray5 = new byte[7]; + assertThat(iterator.value(valueArray5, 1, 6)).isEqualTo(6); + assertThat(valueArray5).isEqualTo("\0value1".getBytes()); + } + } + } + + @Test + public void rocksIteratorByteArrayKeys() throws RocksDBException { + try (final Options options = + new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + + final byte[] keyArray0 = new byte[2]; + assertThat(iterator.key(keyArray0)).isEqualTo(4); + assertThat(keyArray0).isEqualTo("ke".getBytes()); + final byte[] keyArray1 = new byte[8]; + assertThat(iterator.key(keyArray1)).isEqualTo(4); + assertThat(keyArray1).isEqualTo("key1\0\0\0\0".getBytes()); + final byte[] keyArray2 = new byte[10]; + assertThat(iterator.key(keyArray2, 2, 6)).isEqualTo(4); + assertThat(keyArray2).isEqualTo("\0\0key1\0\0\0\0".getBytes()); + final byte[] keyArray3 = new byte[10]; + assertThat(iterator.key(keyArray3, 5, 3)).isEqualTo(4); + assertThat(keyArray3).isEqualTo("\0\0\0\0\0key\0\0".getBytes()); + final byte[] keyArray4 = new byte[4]; + try { + iterator.key(keyArray4, 1, 4); + fail("Expected IndexOutOfBoundsException"); + } catch (final IndexOutOfBoundsException ignored) { + // we should arrive here + } + final byte[] keyArray5 = new byte[5]; + assertThat(iterator.key(keyArray5, 1, 4)).isEqualTo(4); + assertThat(keyArray5).isEqualTo("\0key1".getBytes()); + } + } + } + + @Test + public void rocksIteratorSimple() throws RocksDBException { + try (final Options options = + new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); iterator.next(); assertThat(iterator.isValid()).isTrue(); @@ -90,6 +188,23 @@ public void rocksIterator() throws RocksDBException { assertThat(iterator.key()).isEqualTo("key2".getBytes()); assertThat(iterator.value()).isEqualTo("value2".getBytes()); iterator.status(); + } + } + } + + @Test + public void rocksIterator() throws RocksDBException { + try (final Options options = + new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); { final ByteBuffer key = ByteBuffer.allocate(12); @@ -236,6 +351,69 @@ public void rocksIteratorSeekAndInsert() throws RocksDBException { } } + @Test + public void rocksIteratorSeekAndInsertOnSnapshot() throws RocksDBException { + try (final Options options = + new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + try (final Snapshot snapshot = db.getSnapshot()) { + try (final RocksIterator iterator = db.newIterator()) { + // check for just keys 1 and 2 + iterator.seek("key0".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seek("key2".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + + iterator.seek("key3".getBytes()); + assertThat(iterator.isValid()).isFalse(); + } + + // add a new key (after the snapshot was taken) + db.put("key3".getBytes(), "value3".getBytes()); + + try (final RocksIterator iterator = db.newIterator()) { + // check for keys 1, 2, and 3 + iterator.seek("key0".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seek("key2".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + + iterator.seek("key3".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key3".getBytes()); + + iterator.seek("key4".getBytes()); + assertThat(iterator.isValid()).isFalse(); + + // reset iterator to snapshot, iterator should now only see keys + // there were present in the db when the snapshot was taken + iterator.refresh(snapshot); + + // again check for just keys 1 and 2 + iterator.seek("key0".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seek("key2".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + + iterator.seek("key3".getBytes()); + assertThat(iterator.isValid()).isFalse(); + } + } + } + } + @Test public void rocksIteratorReleaseAfterCfClose() throws RocksDBException { try (final Options options = new Options() diff --git a/java/src/test/java/org/rocksdb/TransactionDBTest.java b/java/src/test/java/org/rocksdb/TransactionDBTest.java index 56acb21c725..e158c92fdff 100644 --- a/java/src/test/java/org/rocksdb/TransactionDBTest.java +++ b/java/src/test/java/org/rocksdb/TransactionDBTest.java @@ -31,10 +31,9 @@ public void open() throws RocksDBException { @Test public void open_columnFamilies() throws RocksDBException { - try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { - + try (final DBOptions dbOptions = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { final List columnFamilyDescriptors = Arrays.asList( new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), @@ -57,6 +56,24 @@ public void open_columnFamilies() throws RocksDBException { } } + @Test(expected = IllegalArgumentException.class) + public void open_columnFamilies_no_default() throws RocksDBException { + try (final DBOptions dbOptions = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + final List columnFamilyDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List columnFamilyHandles = new ArrayList<>(); + + try ( + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB ignored = TransactionDB.open(dbOptions, txnDbOptions, + dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) { + } + } + } + @Test public void beginTransaction() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); @@ -174,4 +191,22 @@ public void setDeadlockInfoBufferSize() throws RocksDBException { tdb.setDeadlockInfoBufferSize(123); } } + + @Test + public void tdbSimpleIterator() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true).setMaxCompactionBytes(0); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = + TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) { + tdb.put("keyI".getBytes(), "valueI".getBytes()); + try (final RocksIterator iterator = tdb.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("keyI".getBytes()); + assertThat(iterator.value()).isEqualTo("valueI".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + } + } + } } diff --git a/java/src/test/java/org/rocksdb/TransactionTest.java b/java/src/test/java/org/rocksdb/TransactionTest.java index b80445c5c80..03a6b4ff6b3 100644 --- a/java/src/test/java/org/rocksdb/TransactionTest.java +++ b/java/src/test/java/org/rocksdb/TransactionTest.java @@ -416,12 +416,13 @@ public TransactionDBContainer startDb() throws RocksDBException { .setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final ColumnFamilyOptions defaultColumnFamilyOptions = new ColumnFamilyOptions(); + defaultColumnFamilyOptions.setMergeOperator(new StringAppendOperator("++")); final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); - final List columnFamilyDescriptors = - Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, - columnFamilyOptions)); + columnFamilyOptions.setMergeOperator(new StringAppendOperator("**")); + final List columnFamilyDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, defaultColumnFamilyOptions), + new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, columnFamilyOptions)); final List columnFamilyHandles = new ArrayList<>(); final TransactionDB txnDb; diff --git a/java/src/test/java/org/rocksdb/TtlDBTest.java b/java/src/test/java/org/rocksdb/TtlDBTest.java index ebf9e9eaa3b..abae3a44a77 100644 --- a/java/src/test/java/org/rocksdb/TtlDBTest.java +++ b/java/src/test/java/org/rocksdb/TtlDBTest.java @@ -5,19 +5,20 @@ package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import static org.assertj.core.api.Assertions.assertThat; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeUnit; - -import static org.assertj.core.api.Assertions.assertThat; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; public class TtlDBTest { + private static final int BATCH_ITERATION = 16; @ClassRule public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = @@ -50,6 +51,22 @@ public void ttlDBOpenWithTtl() throws RocksDBException, InterruptedException { } } + @Test + public void ttlDBSimpleIterator() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true).setMaxCompactionBytes(0); + final TtlDB ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + ttlDB.put("keyI".getBytes(), "valueI".getBytes()); + try (final RocksIterator iterator = ttlDB.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("keyI".getBytes()); + assertThat(iterator.value()).isEqualTo("valueI".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + } + } + } + @Test public void ttlDbOpenWithColumnFamilies() throws RocksDBException, InterruptedException { @@ -109,4 +126,77 @@ public void createTtlColumnFamily() throws RocksDBException, assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull(); } } + + @Test + public void writeBatchWithFlush() throws RocksDBException { + try (final Options dbOptions = new Options()) { + dbOptions.setCreateIfMissing(true); + dbOptions.setCreateMissingColumnFamilies(true); + + try (final RocksDB db = + TtlDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), 100, false)) { + try (WriteBatch wb = new WriteBatch()) { + for (int i = 0; i < BATCH_ITERATION; i++) { + wb.put(("key" + i).getBytes(StandardCharsets.UTF_8), + ("value" + i).getBytes(StandardCharsets.UTF_8)); + } + try (WriteOptions writeOptions = new WriteOptions()) { + db.write(writeOptions, wb); + } + try (FlushOptions fOptions = new FlushOptions()) { + db.flush(fOptions); + } + } + for (int i = 0; i < BATCH_ITERATION; i++) { + assertThat(db.get(("key" + i).getBytes(StandardCharsets.UTF_8))) + .isEqualTo(("value" + i).getBytes(StandardCharsets.UTF_8)); + } + } + } + } + + @Test + public void writeBatchWithFlushAndColumnFamily() throws RocksDBException { + try (final DBOptions dbOptions = new DBOptions()) { + System.out.println("Test start"); + dbOptions.setCreateIfMissing(true); + dbOptions.setCreateMissingColumnFamilies(true); + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor("new_cf".getBytes()), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List columnFamilyHandleList = new ArrayList<>(); + + final List ttlValues = Arrays.asList(0, 1); + + try (final RocksDB db = TtlDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), cfNames, + columnFamilyHandleList, ttlValues, false)) { + try { + assertThat(columnFamilyHandleList.get(1).isDefaultColumnFamily()).isTrue(); + + try (WriteBatch wb = new WriteBatch()) { + for (int i = 0; i < BATCH_ITERATION; i++) { + wb.put(("key" + i).getBytes(StandardCharsets.UTF_8), + ("value" + i).getBytes(StandardCharsets.UTF_8)); + } + try (WriteOptions writeOptions = new WriteOptions()) { + db.write(writeOptions, wb); + } + try (FlushOptions fOptions = new FlushOptions()) { + // Test both flush options, db.flush(fOptions) slush only default CF + db.flush(fOptions); + db.flush(fOptions, columnFamilyHandleList); + } + } + for (int i = 0; i < BATCH_ITERATION; i++) { + assertThat(db.get(("key" + i).getBytes(StandardCharsets.UTF_8))) + .isEqualTo(("value" + i).getBytes(StandardCharsets.UTF_8)); + } + } finally { + // All CF handles must be closed before we close DB. + columnFamilyHandleList.stream().forEach(ColumnFamilyHandle::close); + } + } + } + } } diff --git a/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java b/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java index ddc2a456fca..4ca32dcab93 100644 --- a/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java +++ b/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java @@ -12,6 +12,7 @@ import java.util.Collections; import java.util.List; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -127,9 +128,9 @@ abstract void performOperations(final RocksDB db, final boolean verifyFlag) /** * Run some operations and count the TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT before and after * It should GO UP when the read options have checksum verification turned on. - * It shoulld REMAIN UNCHANGED when the read options have checksum verification turned off. + * It should REMAIN UNCHANGED when the read options have checksum verification turned off. * As the read options refer only to the read operations, there are still a few checksums - * performed outside this (blocks are getting loaded for lots of reasons, not aways directly due + * performed outside this (blocks are getting loaded for lots of reasons, not always directly due * to reads) but this test provides a good enough proxy for whether the flag is being noticed. * * @param operations the DB reading operations to perform which affect the checksum stats @@ -166,9 +167,10 @@ private void verifyChecksums(final Operations operations) throws RocksDBExceptio statistics.getTickerCount(TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT); if (verifyFlag) { // We don't need to be exact - we are checking that the checksums happen - // exactly how many depends on block size etc etc, so may not be entirely stable + // exactly how many depends on block size, MultiGet batching etc etc, + // so may not be entirely stable System.out.println(MessageFormat.format("verify=true {0}", afterOperationsCount)); - assertThat(afterOperationsCount).isGreaterThan(beforeOperationsCount + 20); + assertThat(afterOperationsCount).isGreaterThan(beforeOperationsCount); } else { System.out.println(MessageFormat.format("verify=false {0}", afterOperationsCount)); assertThat(afterOperationsCount).isEqualTo(beforeOperationsCount); @@ -200,8 +202,11 @@ void performOperations(final RocksDB db, final boolean verifyFlag) throws RocksD }); } + @Ignore( + "The block checksum count looks as if it is not updated when a more optimized C++ multiGet is used.") @Test - public void verifyChecksumsMultiGet() throws RocksDBException { + public void + verifyChecksumsMultiGet() throws RocksDBException { // noinspection AnonymousInnerClassMayBeStatic verifyChecksums(new Operations(KV_COUNT) { @Override diff --git a/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java b/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java index 69f2c282b02..23b20df63de 100644 --- a/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java +++ b/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java @@ -484,6 +484,11 @@ public void refresh() throws RocksDBException { offset = -1; } + @Override + public void refresh(final Snapshot snapshot) throws RocksDBException { + offset = -1; + } + @Override public void status() throws RocksDBException { if(offset < 0 || offset >= entries.size()) { diff --git a/java/src/test/java/org/rocksdb/util/StdErrLoggerTest.java b/java/src/test/java/org/rocksdb/util/StdErrLoggerTest.java new file mode 100644 index 00000000000..0788ec92e87 --- /dev/null +++ b/java/src/test/java/org/rocksdb/util/StdErrLoggerTest.java @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb.util; + +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.DBOptions; +import org.rocksdb.InfoLogLevel; +import org.rocksdb.Options; +import org.rocksdb.RocksNativeLibraryResource; + +public class StdErrLoggerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + // Logging with the stderr logger would pollute the console when tests were run (and + // from Java, we can't redirect or close stderr). + // So we just test creation of a StdErrLogger and setting it on Options + // without opening the DB. + @Test + public void nativeLoggersWithOptions() { + try (final Options options = new Options().setCreateIfMissing(true); + final StdErrLogger stdErrLogger = + new StdErrLogger(InfoLogLevel.DEBUG_LEVEL, "[Options prefix]")) { + options.setLogger(stdErrLogger); + } + } + + // Logging with the stderr logger would pollute the console when tests were run (and + // from Java, we can't redirect or close stderr). + // So we just test creation of a StdErrLogger and setting it on DBOptions + // without opening the DB. + @Test + public void nativeLoggersWithDBOptions() { + try (final DBOptions options = new DBOptions().setCreateIfMissing(true); + final StdErrLogger stdErrLogger = + new StdErrLogger(InfoLogLevel.DEBUG_LEVEL, "[DBOptions prefix]")) { + options.setLogger(stdErrLogger); + } + } +} diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index 9e9ad45aee0..c186ab44a96 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -192,14 +192,13 @@ void AutoRollLogger::LogInternal(const char* format, ...) { } void AutoRollLogger::Logv(const char* format, va_list ap) { - assert(GetStatus().ok()); - if (!logger_) { - return; - } - std::shared_ptr logger; { MutexLock l(&mutex_); + assert(GetStatus().ok()); + if (!logger_) { + return; + } if ((kLogFileTimeToRoll > 0 && LogExpired()) || (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { RollLogFile(); @@ -240,10 +239,6 @@ void AutoRollLogger::WriteHeaderInfo() { } void AutoRollLogger::LogHeader(const char* format, va_list args) { - if (!logger_) { - return; - } - // header message are to be retained in memory. Since we cannot make any // assumptions about the data contained in va_list, we will retain them as // strings @@ -253,6 +248,9 @@ void AutoRollLogger::LogHeader(const char* format, va_list args) { va_end(tmp); MutexLock l(&mutex_); + if (!logger_) { + return; + } headers_.push_back(data); // Log the original message to the current log diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h index dca9996fea0..be0b14051a6 100644 --- a/logging/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -36,19 +36,18 @@ class AutoRollLogger : public Logger { // Write a header entry to the log. All header information will be written // again every time the log rolls over. - virtual void LogHeader(const char* format, va_list ap) override; + void LogHeader(const char* format, va_list ap) override; // check if the logger has encountered any problem. Status GetStatus() { return status_; } size_t GetLogFileSize() const override { - if (!logger_) { - return 0; - } - std::shared_ptr logger; { MutexLock l(&mutex_); + if (!logger_) { + return 0; + } // pin down the current logger_ instance before releasing the mutex. logger = logger_; } @@ -106,7 +105,7 @@ class AutoRollLogger : public Logger { protected: // Implementation of Close() - virtual Status CloseImpl() override { + Status CloseImpl() override { if (logger_) { return logger_->Close(); } else { diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index 3d0ec1763f2..344fea96e8b 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -574,8 +574,8 @@ TEST_F(AutoRollLoggerTest, Close) { static std::vector GetOldFileNames(const std::string& path) { std::vector ret; - const std::string dirname = path.substr(/*start=*/0, path.find_last_of("/")); - const std::string fname = path.substr(path.find_last_of("/") + 1); + const std::string dirname = path.substr(/*start=*/0, path.find_last_of('/')); + const std::string fname = path.substr(path.find_last_of('/') + 1); std::vector children; EXPECT_OK(Env::Default()->GetChildren(dirname, &children)); diff --git a/logging/env_logger.h b/logging/env_logger.h index fc9b245504f..b236dc817cd 100644 --- a/logging/env_logger.h +++ b/logging/env_logger.h @@ -75,7 +75,7 @@ class EnvLogger : public Logger { mutex_.AssertHeld(); if (flush_pending_) { flush_pending_ = false; - file_.Flush().PermitUncheckedError(); + file_.Flush(IOOptions()).PermitUncheckedError(); file_.reset_seen_error(); } last_flush_micros_ = clock_->NowMicros(); @@ -93,7 +93,7 @@ class EnvLogger : public Logger { Status CloseHelper() { FileOpGuard guard(*this); - const auto close_status = file_.Close(); + const auto close_status = file_.Close(IOOptions()); if (close_status.ok()) { return close_status; @@ -162,7 +162,7 @@ class EnvLogger : public Logger { { FileOpGuard guard(*this); // We will ignore any error returned by Append(). - file_.Append(Slice(base, p - base)).PermitUncheckedError(); + file_.Append(IOOptions(), Slice(base, p - base)).PermitUncheckedError(); file_.reset_seen_error(); flush_pending_ = true; const uint64_t now_micros = clock_->NowMicros(); diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc index 467ab064f4c..21db8b658e0 100644 --- a/logging/env_logger_test.cc +++ b/logging/env_logger_test.cc @@ -138,7 +138,7 @@ TEST_F(EnvLoggerTest, ConcurrentLogging) { const int kNumThreads = 5; // Create threads. for (int ii = 0; ii < kNumThreads; ++ii) { - threads.push_back(port::Thread(cb)); + threads.emplace_back(cb); } // Wait for them to complete. diff --git a/logging/log_buffer.h b/logging/log_buffer.h index 92d38d10d14..11945e6e782 100644 --- a/logging/log_buffer.h +++ b/logging/log_buffer.h @@ -49,9 +49,9 @@ class LogBuffer { // Add log to the LogBuffer for a delayed info logging. It can be used when // we want to add some logs inside a mutex. // max_log_size indicates maximize log size, including some metadata. -extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, - const char* format, ...); +void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format, + ...); // Same as previous function, but with default max log size. -extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); +void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); } // namespace ROCKSDB_NAMESPACE diff --git a/memory/arena.h b/memory/arena.h index 39399aa71b4..2257a4935dd 100644 --- a/memory/arena.h +++ b/memory/arena.h @@ -132,4 +132,15 @@ inline char* Arena::Allocate(size_t bytes) { return AllocateFallback(bytes, false /* unaligned */); } +// Like std::destroy_at but a callable type +template +struct Destroyer { + void operator()(T* ptr) { ptr->~T(); } +}; + +// Like std::unique_ptr but only placement-deletes the object (for +// objects allocated on an arena). +template +using ScopedArenaPtr = std::unique_ptr>; + } // namespace ROCKSDB_NAMESPACE diff --git a/memory/arena_test.cc b/memory/arena_test.cc index 592bbd723f5..1547a86d5dd 100644 --- a/memory/arena_test.cc +++ b/memory/arena_test.cc @@ -12,6 +12,7 @@ #ifndef OS_WIN #include #endif +#include "port/jemalloc_helper.h" #include "port/port.h" #include "test_util/testharness.h" #include "util/random.h" @@ -170,7 +171,7 @@ static void SimpleTest(size_t huge_page_size) { r[b] = i % 256; } bytes += s; - allocated.push_back(std::make_pair(s, r)); + allocated.emplace_back(s, r); ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); if (i > N / 10) { ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); @@ -267,7 +268,21 @@ TEST_F(ArenaTest, UnmappedAllocation) { // Verify that it's possible to get unmapped pages in large allocations, // for memory efficiency and to ensure we don't accidentally waste time & // space initializing the memory. - constexpr size_t kBlockSize = 2U << 20; + +#ifdef ROCKSDB_JEMALLOC + // With Jemalloc config.fill, the pages are written to before we get them + uint8_t fill = 0; + size_t fill_sz = sizeof(fill); + mallctl("config.fill", &fill, &fill_sz, nullptr, 0); + if (fill) { + ROCKSDB_GTEST_BYPASS("Test skipped because of config.fill==true"); + return; + } +#endif // ROCKSDB_JEMALLOC + + // This block size value is smaller than the smallest x86 huge page size, + // so should not be fulfilled by a transparent huge page mapping. + constexpr size_t kBlockSize = 1U << 20; Arena arena(kBlockSize); // The allocator might give us back recycled memory for a while, but diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc index 9bcd679aeab..50a64692079 100644 --- a/memory/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -221,7 +221,7 @@ int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { size > options_.tcache_size_upper_bound)) { return MALLOCX_TCACHE_NONE; } - unsigned* tcache_index = reinterpret_cast(tcache_.Get()); + unsigned* tcache_index = static_cast(tcache_.Get()); if (UNLIKELY(tcache_index == nullptr)) { // Instantiate tcache. tcache_index = new unsigned(0); diff --git a/memory/memory_allocator_impl.h b/memory/memory_allocator_impl.h index 68aa35beb86..f1d3b9472cc 100644 --- a/memory/memory_allocator_impl.h +++ b/memory/memory_allocator_impl.h @@ -17,7 +17,7 @@ struct CustomDeleter { void operator()(char* ptr) const { if (allocator) { - allocator->Deallocate(reinterpret_cast(ptr)); + allocator->Deallocate(ptr); } else { delete[] ptr; } diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index 4c6d3543193..d780df0bf3d 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include +#include #include "memory/allocator.h" #include "memory/arena.h" diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index 9e60f9be378..7a87c7ed5ff 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -81,7 +81,7 @@ struct Node { void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); } // Needed for placement new below which is fine - Node() {} + Node() = default; private: std::atomic next_; @@ -265,7 +265,7 @@ class HashLinkListRep : public MemTableRep { explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) : iter_(list), full_list_(list), allocator_(allocator) {} - ~FullListIterator() override {} + ~FullListIterator() override = default; // Returns true iff the iterator is positioned at a valid node. bool Valid() const override { return iter_.Valid(); } @@ -332,7 +332,7 @@ class HashLinkListRep : public MemTableRep { head_(head), node_(nullptr) {} - ~LinkListIterator() override {} + ~LinkListIterator() override = default; // Returns true iff the iterator is positioned at a valid node. bool Valid() const override { return node_ != nullptr; } @@ -482,7 +482,7 @@ class HashLinkListRep : public MemTableRep { // This is used when there wasn't a bucket. It is cheaper than // instantiating an empty bucket over which to iterate. public: - EmptyIterator() {} + EmptyIterator() = default; bool Valid() const override { return false; } const char* key() const override { assert(false); @@ -526,7 +526,7 @@ HashLinkListRep::HashLinkListRep( } } -HashLinkListRep::~HashLinkListRep() {} +HashLinkListRep::~HashLinkListRep() = default; KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { char* mem = allocator_->AllocateAligned(sizeof(Node) + len); @@ -887,14 +887,15 @@ class HashLinkListRepFactory : public MemTableRepFactory { } using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, + const SliceTransform* transform, + Logger* logger) override; static const char* kClassName() { return "HashLinkListRepFactory"; } static const char* kNickName() { return "hash_linkedlist"; } - virtual const char* Name() const override { return kClassName(); } - virtual const char* NickName() const override { return kNickName(); } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } private: HashLinkListRepOptions options_; diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index 15ff4f0719b..ca4c376db93 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -208,7 +208,7 @@ class HashSkipListRep : public MemTableRep { // This is used when there wasn't a bucket. It is cheaper than // instantiating an empty bucket over which to iterate. public: - EmptyIterator() {} + EmptyIterator() = default; bool Valid() const override { return false; } const char* key() const override { assert(false); @@ -248,7 +248,7 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, } } -HashSkipListRep::~HashSkipListRep() {} +HashSkipListRep::~HashSkipListRep() = default; HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( const Slice& transformed) { @@ -357,15 +357,16 @@ class HashSkipListRepFactory : public MemTableRepFactory { } using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, + const SliceTransform* transform, + Logger* logger) override; static const char* kClassName() { return "HashSkipListRepFactory"; } static const char* kNickName() { return "prefix_hash"; } - virtual const char* Name() const override { return kClassName(); } - virtual const char* NickName() const override { return kNickName(); } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } private: HashSkipListRepOptions options_; diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index abb3c3ddb7f..8e2d548b430 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -739,7 +739,7 @@ bool InlineSkipList::InsertWithHint(const char* key, void** hint) { Splice* splice = reinterpret_cast(*hint); if (splice == nullptr) { splice = AllocateSplice(); - *hint = reinterpret_cast(splice); + *hint = splice; } return Insert(key, splice, true); } @@ -751,7 +751,7 @@ bool InlineSkipList::InsertWithHintConcurrently(const char* key, Splice* splice = reinterpret_cast(*hint); if (splice == nullptr) { splice = AllocateSpliceOnHeap(); - *hint = reinterpret_cast(splice); + *hint = splice; } return Insert(key, splice, true); } diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index 930574ec726..5c019dec181 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -568,7 +568,7 @@ class TestState { }; static void ConcurrentReader(void* arg) { - TestState* state = reinterpret_cast(arg); + TestState* state = static_cast(arg); Random rnd(state->seed_); int64_t reads = 0; state->Change(TestState::RUNNING); @@ -581,7 +581,7 @@ static void ConcurrentReader(void* arg) { } static void ConcurrentWriter(void* arg) { - TestState* state = reinterpret_cast(arg); + TestState* state = static_cast(arg); uint32_t k = state->next_writer_++ % ConcurrentTest::K; state->t_.ConcurrentWriteStep(k, state->use_hint_); state->AdjustPendingWriters(-1); diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 868c51876e9..eb5d2e7201c 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -340,7 +340,7 @@ class TestState { }; static void ConcurrentReader(void* arg) { - TestState* state = reinterpret_cast(arg); + TestState* state = static_cast(arg); Random rnd(state->seed_); int64_t reads = 0; state->Change(TestState::RUNNING); diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index c3b4c785d38..e615ef9f68c 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -161,7 +161,7 @@ class SkipListRep : public MemTableRep { } } - ~SkipListRep() override {} + ~SkipListRep() override = default; // Iteration over the contents of a skip list class Iterator : public MemTableRep::Iterator { @@ -174,7 +174,7 @@ class SkipListRep : public MemTableRep { const InlineSkipList* list) : iter_(list) {} - ~Iterator() override {} + ~Iterator() override = default; // Returns true iff the iterator is positioned at a valid node. bool Valid() const override { return iter_.Valid(); } @@ -232,7 +232,7 @@ class SkipListRep : public MemTableRep { explicit LookaheadIterator(const SkipListRep& rep) : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} - ~LookaheadIterator() override {} + ~LookaheadIterator() override = default; bool Valid() const override { return iter_.Valid(); } diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index e42ae4439c8..9b0192cb8e8 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -40,7 +40,7 @@ class VectorRep : public MemTableRep { void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)) override; - ~VectorRep() override {} + ~VectorRep() override = default; class Iterator : public MemTableRep::Iterator { class VectorRep* vrep_; @@ -59,7 +59,7 @@ class VectorRep : public MemTableRep { // Initialize an iterator over the specified collection. // The returned iterator is not valid. // explicit Iterator(const MemTableRep* collection); - ~Iterator() override{}; + ~Iterator() override = default; // Returns true iff the iterator is positioned at a valid node. bool Valid() const override; diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc index c2e547f607b..2eca31f1084 100644 --- a/microbench/db_basic_bench.cc +++ b/microbench/db_basic_bench.cc @@ -543,7 +543,9 @@ BENCHMARK(ManualFlush)->Iterations(1)->Apply(ManualFlushArguments); static Slice CompressibleString(Random* rnd, double compressed_fraction, int len, std::string* dst) { int raw = static_cast(len * compressed_fraction); - if (raw < 1) raw = 1; + if (raw < 1) { + raw = 1; + } std::string raw_data = rnd->RandomBinaryString(raw); // Duplicate the random data until we have filled "len" bytes diff --git a/monitoring/file_read_sample.h b/monitoring/file_read_sample.h index 82a933e0a18..3937199cd39 100644 --- a/monitoring/file_read_sample.h +++ b/monitoring/file_read_sample.h @@ -9,8 +9,8 @@ namespace ROCKSDB_NAMESPACE { static const uint32_t kFileReadSampleRate = 1024; -extern bool should_sample_file_read(); -extern void sample_file_read_inc(FileMetaData*); +bool should_sample_file_read(); +void sample_file_read_inc(FileMetaData*); inline bool should_sample_file_read() { return (Random::GetTLSInstance()->Next() % kFileReadSampleRate == 307); diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 61bc6c14097..e27a63edbd1 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -9,12 +9,11 @@ #include "monitoring/histogram.h" -#include - #include #include #include #include +#include #include "port/port.h" #include "util/cast_util.h" @@ -45,10 +44,11 @@ HistogramBucketMapper::HistogramBucketMapper() { size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { auto beg = bucketValues_.begin(); auto end = bucketValues_.end(); - if (value >= maxBucketValue_) + if (value >= maxBucketValue_) { return end - beg - 1; // bucketValues_.size() - 1 - else + } else { return std::lower_bound(beg, end, value) - beg; + } } namespace { @@ -69,7 +69,7 @@ void HistogramStat::Clear() { for (unsigned int b = 0; b < num_buckets_; b++) { buckets_[b].store(0, std::memory_order_relaxed); } -}; +} bool HistogramStat::Empty() const { return num() == 0; } @@ -147,8 +147,12 @@ double HistogramStat::Percentile(double p) const { double r = left_point + (right_point - left_point) * pos; uint64_t cur_min = min(); uint64_t cur_max = max(); - if (r < cur_min) r = static_cast(cur_min); - if (r > cur_max) r = static_cast(cur_max); + if (r < cur_min) { + r = static_cast(cur_min); + } + if (r > cur_max) { + r = static_cast(cur_max); + } return r; } } @@ -158,7 +162,9 @@ double HistogramStat::Percentile(double p) const { double HistogramStat::Average() const { uint64_t cur_num = num(); uint64_t cur_sum = sum(); - if (cur_num == 0) return 0; + if (cur_num == 0) { + return 0; + } return static_cast(cur_sum) / static_cast(cur_num); } @@ -193,12 +199,16 @@ std::string HistogramStat::ToString() const { Percentile(99.99)); r.append(buf); r.append("------------------------------------------------------\n"); - if (cur_num == 0) return r; // all buckets are empty + if (cur_num == 0) { + return r; // all buckets are empty + } const double mult = 100.0 / cur_num; uint64_t cumulative_sum = 0; for (unsigned int b = 0; b < num_buckets_; b++) { uint64_t bucket_value = bucket_at(b); - if (bucket_value <= 0.0) continue; + if (bucket_value <= 0.0) { + continue; + } cumulative_sum += bucket_value; snprintf(buf, sizeof(buf), "%c %7" PRIu64 ", %7" PRIu64 " ] %8" PRIu64 " %7.3f%% %7.3f%% ", diff --git a/monitoring/histogram.h b/monitoring/histogram.h index 15fee2b4f8d..ccfdd8ddd9a 100644 --- a/monitoring/histogram.h +++ b/monitoring/histogram.h @@ -88,7 +88,7 @@ struct HistogramStat { class Histogram { public: Histogram() {} - virtual ~Histogram(){}; + virtual ~Histogram(){} virtual void Clear() = 0; virtual bool Empty() const = 0; @@ -114,22 +114,22 @@ class HistogramImpl : public Histogram { HistogramImpl(const HistogramImpl&) = delete; HistogramImpl& operator=(const HistogramImpl&) = delete; - virtual void Clear() override; - virtual bool Empty() const override; - virtual void Add(uint64_t value) override; - virtual void Merge(const Histogram& other) override; + void Clear() override; + bool Empty() const override; + void Add(uint64_t value) override; + void Merge(const Histogram& other) override; void Merge(const HistogramImpl& other); - virtual std::string ToString() const override; - virtual const char* Name() const override { return "HistogramImpl"; } - virtual uint64_t min() const override { return stats_.min(); } - virtual uint64_t max() const override { return stats_.max(); } - virtual uint64_t num() const override { return stats_.num(); } - virtual double Median() const override; - virtual double Percentile(double p) const override; - virtual double Average() const override; - virtual double StandardDeviation() const override; - virtual void Data(HistogramData* const data) const override; + std::string ToString() const override; + const char* Name() const override { return "HistogramImpl"; } + uint64_t min() const override { return stats_.min(); } + uint64_t max() const override { return stats_.max(); } + uint64_t num() const override { return stats_.num(); } + double Median() const override; + double Percentile(double p) const override; + double Average() const override; + double StandardDeviation() const override; + void Data(HistogramData* const data) const override; virtual ~HistogramImpl() {} diff --git a/monitoring/histogram_windowing.cc b/monitoring/histogram_windowing.cc index c41ae8a03de..726231a71c2 100644 --- a/monitoring/histogram_windowing.cc +++ b/monitoring/histogram_windowing.cc @@ -34,7 +34,7 @@ HistogramWindowingImpl::HistogramWindowingImpl(uint64_t num_windows, Clear(); } -HistogramWindowingImpl::~HistogramWindowingImpl() {} +HistogramWindowingImpl::~HistogramWindowingImpl() = default; void HistogramWindowingImpl::Clear() { std::lock_guard lock(mutex_); @@ -159,7 +159,9 @@ void HistogramWindowingImpl::SwapHistoryBucket() { for (unsigned int i = 0; i < num_windows_; i++) { if (i != next_window) { uint64_t m = window_stats_[i].min(); - if (m < new_min) new_min = m; + if (m < new_min) { + new_min = m; + } } } stats_.min_.store(new_min, std::memory_order_relaxed); @@ -170,7 +172,9 @@ void HistogramWindowingImpl::SwapHistoryBucket() { for (unsigned int i = 0; i < num_windows_; i++) { if (i != next_window) { uint64_t m = window_stats_[i].max(); - if (m > new_max) new_max = m; + if (m > new_max) { + new_max = m; + } } } stats_.max_.store(new_max, std::memory_order_relaxed); diff --git a/monitoring/histogram_windowing.h b/monitoring/histogram_windowing.h index 9a862671f4f..9f956e19305 100644 --- a/monitoring/histogram_windowing.h +++ b/monitoring/histogram_windowing.h @@ -25,22 +25,22 @@ class HistogramWindowingImpl : public Histogram { ~HistogramWindowingImpl(); - virtual void Clear() override; - virtual bool Empty() const override; - virtual void Add(uint64_t value) override; - virtual void Merge(const Histogram& other) override; + void Clear() override; + bool Empty() const override; + void Add(uint64_t value) override; + void Merge(const Histogram& other) override; void Merge(const HistogramWindowingImpl& other); - virtual std::string ToString() const override; - virtual const char* Name() const override { return "HistogramWindowingImpl"; } - virtual uint64_t min() const override { return stats_.min(); } - virtual uint64_t max() const override { return stats_.max(); } - virtual uint64_t num() const override { return stats_.num(); } - virtual double Median() const override; - virtual double Percentile(double p) const override; - virtual double Average() const override; - virtual double StandardDeviation() const override; - virtual void Data(HistogramData* const data) const override; + std::string ToString() const override; + const char* Name() const override { return "HistogramWindowingImpl"; } + uint64_t min() const override { return stats_.min(); } + uint64_t max() const override { return stats_.max(); } + uint64_t num() const override { return stats_.num(); } + double Median() const override; + double Percentile(double p) const override; + double Average() const override; + double StandardDeviation() const override; + void Data(HistogramData* const data) const override; #ifndef NDEBUG void TEST_UpdateClock(const std::shared_ptr& clock) { diff --git a/monitoring/in_memory_stats_history.cc b/monitoring/in_memory_stats_history.cc index 568d8ec134f..7b7c8db83c4 100644 --- a/monitoring/in_memory_stats_history.cc +++ b/monitoring/in_memory_stats_history.cc @@ -12,7 +12,7 @@ namespace ROCKSDB_NAMESPACE { -InMemoryStatsHistoryIterator::~InMemoryStatsHistoryIterator() {} +InMemoryStatsHistoryIterator::~InMemoryStatsHistoryIterator() = default; bool InMemoryStatsHistoryIterator::Valid() const { return valid_; } diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index eda3f0ddc01..5fabb5d9904 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -62,6 +62,10 @@ struct PerfContextByLevelInt { defCmd(block_cache_filter_hit_count) \ defCmd(filter_block_read_count) \ defCmd(compression_dict_block_read_count) \ + defCmd(block_cache_index_read_byte) \ + defCmd(block_cache_filter_read_byte) \ + defCmd(block_cache_compression_dict_read_byte) \ + defCmd(block_cache_read_byte) \ defCmd(multiget_sst_file_read_count) \ defCmd(multiget_sst_serialized_file_read_count) \ defCmd(secondary_cache_hit_count) \ diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index 5b66ff2ff99..b0fad2c7aff 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -29,6 +29,7 @@ extern thread_local PerfContext perf_context; #define PERF_CPU_TIMER_GUARD(metric, clock) #define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ ticker_type) +#define PERF_TIMER_FOR_WAIT_GUARD(metric) #define PERF_TIMER_MEASURE(metric) #define PERF_COUNTER_ADD(metric, value) #define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) @@ -66,6 +67,11 @@ extern thread_local PerfContext perf_context; perf_step_timer_##metric.Start(); \ } +#define PERF_TIMER_FOR_WAIT_GUARD(metric) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ + false, PerfLevel::kEnableWait); \ + perf_step_timer_##metric.Start(); + // Update metric with time elapsed since last START. start time is reset // to current timestamp. #define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure(); @@ -74,7 +80,8 @@ extern thread_local PerfContext perf_context; #define PERF_COUNTER_ADD(metric, value) \ if (perf_level >= PerfLevel::kEnableCount) { \ perf_context.metric += value; \ - } + } \ + static_assert(true, "semicolon required") // Increase metric value #define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ diff --git a/monitoring/perf_level.cc b/monitoring/perf_level.cc index e3507624b63..9ba4e01637c 100644 --- a/monitoring/perf_level.cc +++ b/monitoring/perf_level.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). // -#include +#include #include "monitoring/perf_level_imp.h" diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc index f4c022148c8..8c077c55f0c 100644 --- a/monitoring/persistent_stats_history.cc +++ b/monitoring/persistent_stats_history.cc @@ -41,6 +41,8 @@ Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, } else if (type == StatsVersionKeyType::kCompatibleVersion) { key = kCompatibleVersionKeyString; } + + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions options; options.verify_checksums = true; std::string result; @@ -74,7 +76,7 @@ void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) { cfo->compression = kNoCompression; } -PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {} +PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() = default; bool PersistentStatsHistoryIterator::Valid() const { return valid_; } @@ -96,7 +98,7 @@ std::pair parseKey(const Slice& key, uint64_t start_time) { std::pair result; std::string key_str = key.ToString(); - std::string::size_type pos = key_str.find("#"); + std::string::size_type pos = key_str.find('#'); // TODO(Zhongyi): add counters to track parse failures? if (pos == std::string::npos) { result.first = std::numeric_limits::max(); @@ -122,6 +124,7 @@ void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time) { // try to find next entry in stats_history_ map if (db_impl_ != nullptr) { + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; Iterator* iter = db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily()); diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index ebfd443002f..ed9a089af51 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -39,10 +39,42 @@ const std::vector> TickersNameMap = { {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"}, {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"}, {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"}, + {BLOCK_CACHE_COMPRESSION_DICT_MISS, + "rocksdb.block.cache.compression.dict.miss"}, + {BLOCK_CACHE_COMPRESSION_DICT_HIT, + "rocksdb.block.cache.compression.dict.hit"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD, + "rocksdb.block.cache.compression.dict.add"}, + {BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + "rocksdb.block.cache.compression.dict.bytes.insert"}, + {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"}, + {BLOCK_CACHE_INDEX_ADD_REDUNDANT, + "rocksdb.block.cache.index.add.redundant"}, + {BLOCK_CACHE_FILTER_ADD_REDUNDANT, + "rocksdb.block.cache.filter.add.redundant"}, + {BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + "rocksdb.block.cache.compression.dict.add.redundant"}, + {SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"}, + {SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"}, + {SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"}, + {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, + {COMPRESSED_SECONDARY_CACHE_DUMMY_HITS, + "rocksdb.compressed.secondary.cache.dummy.hits"}, + {COMPRESSED_SECONDARY_CACHE_HITS, + "rocksdb.compressed.secondary.cache.hits"}, + {COMPRESSED_SECONDARY_CACHE_PROMOTIONS, + "rocksdb.compressed.secondary.cache.promotions"}, + {COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS, + "rocksdb.compressed.secondary.cache.promotion.skips"}, {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, {BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"}, {BLOOM_FILTER_FULL_TRUE_POSITIVE, "rocksdb.bloom.filter.full.true.positive"}, + {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {BLOOM_FILTER_PREFIX_TRUE_POSITIVE, + "rocksdb.bloom.filter.prefix.true.positive"}, {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"}, @@ -73,6 +105,10 @@ const std::vector> TickersNameMap = { {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"}, {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"}, {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"}, + {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, + {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, + {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, {NO_FILE_OPENS, "rocksdb.no.file.opens"}, {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, {STALL_MICROS, "rocksdb.stall.micros"}, @@ -80,12 +116,8 @@ const std::vector> TickersNameMap = { {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, + {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, - {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, - {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, - {BLOOM_FILTER_PREFIX_TRUE_POSITIVE, - "rocksdb.bloom.filter.prefix.true.positive"}, - {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, @@ -108,7 +140,16 @@ const std::vector> TickersNameMap = { {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, - {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, + {BYTES_COMPRESSED_FROM, "rocksdb.bytes.compressed.from"}, + {BYTES_COMPRESSED_TO, "rocksdb.bytes.compressed.to"}, + {BYTES_COMPRESSION_BYPASSED, "rocksdb.bytes.compression_bypassed"}, + {BYTES_COMPRESSION_REJECTED, "rocksdb.bytes.compression.rejected"}, + {NUMBER_BLOCK_COMPRESSION_BYPASSED, + "rocksdb.number.block_compression_bypassed"}, + {NUMBER_BLOCK_COMPRESSION_REJECTED, + "rocksdb.number.block_compression_rejected"}, + {BYTES_DECOMPRESSED_FROM, "rocksdb.bytes.decompressed.from"}, + {BYTES_DECOMPRESSED_TO, "rocksdb.bytes.decompressed.to"}, {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, {COMPACTION_CPU_TOTAL_TIME, "rocksdb.compaction.total.time.cpu_micros"}, @@ -117,7 +158,6 @@ const std::vector> TickersNameMap = { {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, - {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, {BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"}, {BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"}, {BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"}, @@ -150,45 +190,26 @@ const std::vector> TickersNameMap = { {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"}, + {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"}, + {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"}, + {BLOB_DB_CACHE_ADD, "rocksdb.blobdb.cache.add"}, + {BLOB_DB_CACHE_ADD_FAILURES, "rocksdb.blobdb.cache.add.failures"}, + {BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"}, + {BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"}, {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"}, {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.old.commit.map"}, {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"}, - {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, - {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, - {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, - {BLOCK_CACHE_COMPRESSION_DICT_MISS, - "rocksdb.block.cache.compression.dict.miss"}, - {BLOCK_CACHE_COMPRESSION_DICT_HIT, - "rocksdb.block.cache.compression.dict.hit"}, - {BLOCK_CACHE_COMPRESSION_DICT_ADD, - "rocksdb.block.cache.compression.dict.add"}, - {BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, - "rocksdb.block.cache.compression.dict.bytes.insert"}, - {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"}, - {BLOCK_CACHE_INDEX_ADD_REDUNDANT, - "rocksdb.block.cache.index.add.redundant"}, - {BLOCK_CACHE_FILTER_ADD_REDUNDANT, - "rocksdb.block.cache.filter.add.redundant"}, - {BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"}, - {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, - "rocksdb.block.cache.compression.dict.add.redundant"}, {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, {FILES_DELETED_FROM_TRASH_QUEUE, "rocksdb.files.marked.trash.deleted"}, {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.error.count"}, - {ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED, - "rocksdb.error.handler.bg.errro.count"}, {ERROR_HANDLER_BG_IO_ERROR_COUNT, "rocksdb.error.handler.bg.io.error.count"}, - {ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED, - "rocksdb.error.handler.bg.io.errro.count"}, {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, "rocksdb.error.handler.bg.retryable.io.error.count"}, - {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED, - "rocksdb.error.handler.bg.retryable.io.errro.count"}, {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, "rocksdb.error.handler.autoresume.retry.total.count"}, @@ -198,7 +219,6 @@ const std::vector> TickersNameMap = { "rocksdb.memtable.payload.bytes.at.flush"}, {MEMTABLE_GARBAGE_BYTES_AT_FLUSH, "rocksdb.memtable.garbage.bytes.at.flush"}, - {SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"}, {VERIFY_CHECKSUM_READ_BYTES, "rocksdb.verify_checksum.read.bytes"}, {BACKUP_READ_BYTES, "rocksdb.backup.read.bytes"}, {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"}, @@ -232,38 +252,20 @@ const std::vector> TickersNameMap = { {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}, {BLOCK_CHECKSUM_MISMATCH_COUNT, "rocksdb.block.checksum.mismatch.count"}, {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"}, - {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"}, - {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"}, - {BLOB_DB_CACHE_ADD, "rocksdb.blobdb.cache.add"}, - {BLOB_DB_CACHE_ADD_FAILURES, "rocksdb.blobdb.cache.add.failures"}, - {BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"}, - {BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"}, {READ_ASYNC_MICROS, "rocksdb.read.async.micros"}, {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}, - {SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"}, - {SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"}, - {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, {TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"}, {TIMESTAMP_FILTER_TABLE_FILTERED, "rocksdb.timestamp.filter.table.filtered"}, - {BYTES_COMPRESSED_FROM, "rocksdb.bytes.compressed.from"}, - {BYTES_COMPRESSED_TO, "rocksdb.bytes.compressed.to"}, - {BYTES_COMPRESSION_BYPASSED, "rocksdb.bytes.compression_bypassed"}, - {BYTES_COMPRESSION_REJECTED, "rocksdb.bytes.compression.rejected"}, - {NUMBER_BLOCK_COMPRESSION_BYPASSED, - "rocksdb.number.block_compression_bypassed"}, - {NUMBER_BLOCK_COMPRESSION_REJECTED, - "rocksdb.number.block_compression_rejected"}, - {BYTES_DECOMPRESSED_FROM, "rocksdb.bytes.decompressed.from"}, - {BYTES_DECOMPRESSED_TO, "rocksdb.bytes.decompressed.to"}, {READAHEAD_TRIMMED, "rocksdb.readahead.trimmed"}, {FIFO_MAX_SIZE_COMPACTIONS, "rocksdb.fifo.max.size.compactions"}, {FIFO_TTL_COMPACTIONS, "rocksdb.fifo.ttl.compactions"}, {PREFETCH_BYTES, "rocksdb.prefetch.bytes"}, {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"}, {PREFETCH_HITS, "rocksdb.prefetch.hits"}, + {SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"}, }; const std::vector> HistogramsNameMap = { @@ -295,12 +297,14 @@ const std::vector> HistogramsNameMap = { "rocksdb.file.read.verify.db.checksum.micros"}, {FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS, "rocksdb.file.read.verify.file.checksums.micros"}, + {SST_WRITE_MICROS, "rocksdb.sst.write.micros"}, + {FILE_WRITE_FLUSH_MICROS, "rocksdb.file.write.flush.micros"}, + {FILE_WRITE_COMPACTION_MICROS, "rocksdb.file.write.compaction.micros"}, + {FILE_WRITE_DB_OPEN_MICROS, "rocksdb.file.write.db.open.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, {BYTES_PER_READ, "rocksdb.bytes.per.read"}, {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, - {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, - {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, @@ -319,16 +323,16 @@ const std::vector> HistogramsNameMap = { {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, {FLUSH_TIME, "rocksdb.db.flush.micros"}, {SST_BATCH_SIZE, "rocksdb.sst.batch.size"}, + {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, {NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, "rocksdb.num.index.and.filter.blocks.read.per.level"}, {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, + {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, "rocksdb.error.handler.autoresume.retry.count"}, {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"}, {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"}, {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"}, - {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, - {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"}, {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, "rocksdb.table.open.prefetch.tail.read.bytes"}, @@ -379,7 +383,7 @@ StatisticsImpl::StatisticsImpl(std::shared_ptr stats) RegisterOptions("StatisticsOptions", &stats_, &stats_type_info); } -StatisticsImpl::~StatisticsImpl() {} +StatisticsImpl::~StatisticsImpl() = default; uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const { MutexLock lock(&aggregate_lock_); @@ -538,7 +542,9 @@ std::string StatisticsImpl::ToString() const { bool StatisticsImpl::getTickerMap( std::map* stats_map) const { assert(stats_map); - if (!stats_map) return false; + if (!stats_map) { + return false; + } stats_map->clear(); MutexLock lock(&aggregate_lock_); for (const auto& t : TickersNameMap) { diff --git a/monitoring/statistics_impl.h b/monitoring/statistics_impl.h index e0dc29d2846..5dceeace0e8 100644 --- a/monitoring/statistics_impl.h +++ b/monitoring/statistics_impl.h @@ -46,14 +46,14 @@ class StatisticsImpl : public Statistics { const char* Name() const override { return kClassName(); } static const char* kClassName() { return "BasicStatistics"; } - virtual uint64_t getTickerCount(uint32_t ticker_type) const override; - virtual void histogramData(uint32_t histogram_type, - HistogramData* const data) const override; + uint64_t getTickerCount(uint32_t ticker_type) const override; + void histogramData(uint32_t histogram_type, + HistogramData* const data) const override; std::string getHistogramString(uint32_t histogram_type) const override; - virtual void setTickerCount(uint32_t ticker_type, uint64_t count) override; - virtual uint64_t getAndResetTickerCount(uint32_t ticker_type) override; - virtual void recordTick(uint32_t ticker_type, uint64_t count) override; + void setTickerCount(uint32_t ticker_type, uint64_t count) override; + uint64_t getAndResetTickerCount(uint32_t ticker_type) override; + void recordTick(uint32_t ticker_type, uint64_t count) override; // The function is implemented for now for backward compatibility reason. // In case a user explictly calls it, for example, they may have a wrapped // Statistics object, passing the call to recordTick() into here, nothing @@ -61,13 +61,12 @@ class StatisticsImpl : public Statistics { void measureTime(uint32_t histogramType, uint64_t time) override { recordInHistogram(histogramType, time); } - virtual void recordInHistogram(uint32_t histogram_type, - uint64_t value) override; + void recordInHistogram(uint32_t histogram_type, uint64_t value) override; - virtual Status Reset() override; - virtual std::string ToString() const override; - virtual bool getTickerMap(std::map*) const override; - virtual bool HistEnabledForType(uint32_t type) const override; + Status Reset() override; + std::string ToString() const override; + bool getTickerMap(std::map*) const override; + bool HistEnabledForType(uint32_t type) const override; const Customizable* Inner() const override { return stats_.get(); } diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index 37db0cfe184..295e7bf3daa 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -45,7 +45,7 @@ class StatsHistoryTest : public DBTestBase { SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { auto periodic_task_scheduler_ptr = - reinterpret_cast(arg); + static_cast(arg); periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); }); } diff --git a/monitoring/thread_status_impl.cc b/monitoring/thread_status_impl.cc index 9619dfd81e3..153753682cf 100644 --- a/monitoring/thread_status_impl.cc +++ b/monitoring/thread_status_impl.cc @@ -67,7 +67,7 @@ const std::string ThreadStatus::MicrosToString(uint64_t micros) { const std::string& ThreadStatus::GetOperationPropertyName( ThreadStatus::OperationType op_type, int i) { - static const std::string empty_str = ""; + static const std::string empty_str; switch (op_type) { case ThreadStatus::OP_COMPACTION: if (i >= NUM_COMPACTION_PROPERTIES) { diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index 9b66dc28e86..d61bcba1ce5 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -175,8 +175,14 @@ bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { return false; } +void ThreadStatusUtil::SetEnableTracking(bool /*enable_tracking*/) {} + void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/) {} +ThreadStatus::OperationType ThreadStatusUtil::GetThreadOperation() { + return ThreadStatus::OperationType::OP_UNKNOWN; +} + void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {} void ThreadStatusUtil::SetThreadOperationProperty(int /*code*/, @@ -190,7 +196,7 @@ void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {} void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/, const ColumnFamilyData* /*cfd*/, const std::string& /*cf_name*/, - const Env* env) {} + const Env* /*env*/) {} void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {} diff --git a/options/cf_options.cc b/options/cf_options.cc index 7ae7c8ca4b1..f6d86701317 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -266,8 +266,7 @@ static std::unordered_map {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, {"check_flush_compaction_key_order", - {offsetof(struct MutableCFOptions, check_flush_compaction_key_order), - OptionType::kBoolean, OptionVerificationType::kNormal, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, {"paranoid_file_checks", {offsetof(struct MutableCFOptions, paranoid_file_checks), @@ -297,9 +296,7 @@ static std::unordered_map OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, {"ignore_max_compaction_bytes_for_input", - {offsetof(struct MutableCFOptions, - ignore_max_compaction_bytes_for_input), - OptionType::kBoolean, OptionVerificationType::kNormal, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, {"expanded_compaction_factor", {0, OptionType::kInt, OptionVerificationType::kDeprecated, @@ -342,6 +339,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, max_successive_merges), OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strict_max_successive_merges", + {offsetof(struct MutableCFOptions, strict_max_successive_merges), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"memtable_huge_page_size", {offsetof(struct MutableCFOptions, memtable_huge_page_size), OptionType::kSizeT, OptionVerificationType::kNormal, @@ -417,7 +418,7 @@ static std::unordered_map // value, say, like "23", which would be assigned to // max_table_files_size. if (name == "compaction_options_fifo" && - value.find("=") == std::string::npos) { + value.find('=') == std::string::npos) { // Old format. Parse just a single uint64_t value. auto options = static_cast(addr); options->max_table_files_size = ParseUint64(value); @@ -448,6 +449,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, last_level_temperature), OptionType::kTemperature, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"default_write_temperature", + {offsetof(struct MutableCFOptions, default_write_temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"enable_blob_files", {offsetof(struct MutableCFOptions, enable_blob_files), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -529,7 +534,7 @@ static std::unordered_map // This is to handle backward compatibility, where // compression_options was a ":" separated list. if (name == kOptNameCompOpts && - value.find("=") == std::string::npos) { + value.find('=') == std::string::npos) { auto* compression = static_cast(addr); return ParseCompressionOptions(value, name, *compression); } else { @@ -549,7 +554,7 @@ static std::unordered_map // This is to handle backward compatibility, where // compression_options was a ":" separated list. if (name == kOptNameBMCompOpts && - value.find("=") == std::string::npos) { + value.find('=') == std::string::npos) { auto* compression = static_cast(addr); return ParseCompressionOptions(value, name, *compression); } else { @@ -603,9 +608,7 @@ static std::unordered_map OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"level_compaction_dynamic_file_size", - {offsetof(struct ImmutableCFOptions, - level_compaction_dynamic_file_size), - OptionType::kBoolean, OptionVerificationType::kNormal, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"optimize_filters_for_hits", {offsetof(struct ImmutableCFOptions, optimize_filters_for_hits), @@ -635,7 +638,7 @@ static std::unordered_map {offsetof(struct ImmutableCFOptions, max_write_buffer_number_to_maintain), OptionType::kInt, OptionVerificationType::kNormal, - OptionTypeFlags::kNone, 0}}, + OptionTypeFlags::kNone, nullptr}}, {"max_write_buffer_size_to_maintain", {offsetof(struct ImmutableCFOptions, max_write_buffer_size_to_maintain), @@ -644,7 +647,7 @@ static std::unordered_map {"min_write_buffer_number_to_merge", {offsetof(struct ImmutableCFOptions, min_write_buffer_number_to_merge), OptionType::kInt, OptionVerificationType::kNormal, - OptionTypeFlags::kNone, 0}}, + OptionTypeFlags::kNone, nullptr}}, {"num_levels", {offsetof(struct ImmutableCFOptions, num_levels), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, @@ -863,7 +866,7 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { return s; } - virtual const void* GetOptionsPtr(const std::string& name) const override { + const void* GetOptionsPtr(const std::string& name) const override { if (name == OptionsHelper::kCFOptionsName) { return &cf_options_; } else { @@ -953,8 +956,6 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) bloom_locality(cf_options.bloom_locality), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), - level_compaction_dynamic_file_size( - cf_options.level_compaction_dynamic_file_size), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), @@ -1068,6 +1069,8 @@ void MutableCFOptions::Dump(Logger* log) const { ROCKS_LOG_INFO(log, " max_successive_merges: %" ROCKSDB_PRIszt, max_successive_merges); + ROCKS_LOG_INFO(log, " strict_max_successive_merges: %d", + strict_max_successive_merges); ROCKS_LOG_INFO(log, " inplace_update_num_locks: %" ROCKSDB_PRIszt, inplace_update_num_locks); @@ -1089,8 +1092,6 @@ void MutableCFOptions::Dump(Logger* log) const { level0_stop_writes_trigger); ROCKS_LOG_INFO(log, " max_compaction_bytes: %" PRIu64, max_compaction_bytes); - ROCKS_LOG_INFO(log, " ignore_max_compaction_bytes_for_input: %s", - ignore_max_compaction_bytes_for_input ? "true" : "false"); ROCKS_LOG_INFO(log, " target_file_size_base: %" PRIu64, target_file_size_base); ROCKS_LOG_INFO(log, " target_file_size_multiplier: %d", @@ -1119,8 +1120,6 @@ void MutableCFOptions::Dump(Logger* log) const { result.c_str()); ROCKS_LOG_INFO(log, " max_sequential_skip_in_iterations: %" PRIu64, max_sequential_skip_in_iterations); - ROCKS_LOG_INFO(log, " check_flush_compaction_key_order: %d", - check_flush_compaction_key_order); ROCKS_LOG_INFO(log, " paranoid_file_checks: %d", paranoid_file_checks); ROCKS_LOG_INFO(log, " report_bg_io_stats: %d", diff --git a/options/cf_options.h b/options/cf_options.h index f61a2a5460b..fced8f787f0 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -64,8 +64,6 @@ struct ImmutableCFOptions { bool level_compaction_dynamic_level_bytes; - bool level_compaction_dynamic_file_size; - int num_levels; bool optimize_filters_for_hits; @@ -120,6 +118,7 @@ struct MutableCFOptions { memtable_whole_key_filtering(options.memtable_whole_key_filtering), memtable_huge_page_size(options.memtable_huge_page_size), max_successive_merges(options.max_successive_merges), + strict_max_successive_merges(options.strict_max_successive_merges), inplace_update_num_locks(options.inplace_update_num_locks), prefix_extractor(options.prefix_extractor), experimental_mempurge_threshold( @@ -134,8 +133,6 @@ struct MutableCFOptions { level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), level0_stop_writes_trigger(options.level0_stop_writes_trigger), max_compaction_bytes(options.max_compaction_bytes), - ignore_max_compaction_bytes_for_input( - options.ignore_max_compaction_bytes_for_input), target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), @@ -160,18 +157,14 @@ struct MutableCFOptions { prepopulate_blob_cache(options.prepopulate_blob_cache), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), - check_flush_compaction_key_order( - options.check_flush_compaction_key_order), paranoid_file_checks(options.paranoid_file_checks), report_bg_io_stats(options.report_bg_io_stats), compression(options.compression), bottommost_compression(options.bottommost_compression), compression_opts(options.compression_opts), bottommost_compression_opts(options.bottommost_compression_opts), - last_level_temperature(options.last_level_temperature == - Temperature::kUnknown - ? options.bottommost_temperature - : options.last_level_temperature), + last_level_temperature(options.last_level_temperature), + default_write_temperature(options.default_write_temperature), memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), block_protection_bytes_per_key(options.block_protection_bytes_per_key), @@ -194,6 +187,7 @@ struct MutableCFOptions { memtable_whole_key_filtering(false), memtable_huge_page_size(0), max_successive_merges(0), + strict_max_successive_merges(false), inplace_update_num_locks(0), prefix_extractor(nullptr), experimental_mempurge_threshold(0.0), @@ -204,7 +198,6 @@ struct MutableCFOptions { level0_slowdown_writes_trigger(0), level0_stop_writes_trigger(0), max_compaction_bytes(0), - ignore_max_compaction_bytes_for_input(true), target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), @@ -223,12 +216,12 @@ struct MutableCFOptions { blob_file_starting_level(0), prepopulate_blob_cache(PrepopulateBlobCache::kDisable), max_sequential_skip_in_iterations(0), - check_flush_compaction_key_order(true), paranoid_file_checks(false), report_bg_io_stats(false), compression(Snappy_Supported() ? kSnappyCompression : kNoCompression), bottommost_compression(kDisableCompressionOption), last_level_temperature(Temperature::kUnknown), + default_write_temperature(Temperature::kUnknown), memtable_protection_bytes_per_key(0), block_protection_bytes_per_key(0), sample_for_compression(0), @@ -263,6 +256,7 @@ struct MutableCFOptions { bool memtable_whole_key_filtering; size_t memtable_huge_page_size; size_t max_successive_merges; + bool strict_max_successive_merges; size_t inplace_update_num_locks; std::shared_ptr prefix_extractor; // [experimental] @@ -290,7 +284,6 @@ struct MutableCFOptions { int level0_slowdown_writes_trigger; int level0_stop_writes_trigger; uint64_t max_compaction_bytes; - bool ignore_max_compaction_bytes_for_input; uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; @@ -315,7 +308,6 @@ struct MutableCFOptions { // Misc options uint64_t max_sequential_skip_in_iterations; - bool check_flush_compaction_key_order; bool paranoid_file_checks; bool report_bg_io_stats; CompressionType compression; @@ -323,6 +315,7 @@ struct MutableCFOptions { CompressionOptions compression_opts; CompressionOptions bottommost_compression_opts; Temperature last_level_temperature; + Temperature default_write_temperature; uint32_t memtable_protection_bytes_per_key; uint8_t block_protection_bytes_per_key; diff --git a/options/configurable.cc b/options/configurable.cc index 5491336e0a7..134de99a23a 100644 --- a/options/configurable.cc +++ b/options/configurable.cc @@ -37,9 +37,9 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { // We ignore the invoke_prepare_options here intentionally, // as if you are here, you must have called PrepareOptions explicitly. Status status = Status::OK(); - for (auto opt_iter : options_) { + for (const auto& opt_iter : options_) { if (opt_iter.type_map != nullptr) { - for (auto map_iter : *(opt_iter.type_map)) { + for (const auto& map_iter : *(opt_iter.type_map)) { auto& opt_info = map_iter.second; if (opt_info.ShouldPrepare()) { status = opt_info.Prepare(opts, map_iter.first, opt_iter.opt_ptr); @@ -56,9 +56,9 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { Status Configurable::ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { Status status; - for (auto opt_iter : options_) { + for (const auto& opt_iter : options_) { if (opt_iter.type_map != nullptr) { - for (auto map_iter : *(opt_iter.type_map)) { + for (const auto& map_iter : *(opt_iter.type_map)) { auto& opt_info = map_iter.second; if (opt_info.ShouldValidate()) { status = opt_info.Validate(db_opts, cf_opts, map_iter.first, @@ -80,7 +80,7 @@ Status Configurable::ValidateOptions(const DBOptions& db_opts, /*********************************************************************************/ const void* Configurable::GetOptionsPtr(const std::string& name) const { - for (auto o : options_) { + for (const auto& o : options_) { if (o.name == name) { return o.opt_ptr; } @@ -95,7 +95,7 @@ std::string Configurable::GetOptionName(const std::string& opt_name) const { const OptionTypeInfo* ConfigurableHelper::FindOption( const std::vector& options, const std::string& short_name, std::string* opt_name, void** opt_ptr) { - for (auto iter : options) { + for (const auto& iter : options) { if (iter.type_map != nullptr) { const auto opt_info = OptionTypeInfo::Find(short_name, *(iter.type_map), opt_name); @@ -318,21 +318,29 @@ Status ConfigurableHelper::ConfigureSomeOptions( } // End while found one or options remain // Now that we have been through the list, remove any unsupported - for (auto u : unsupported) { + for (const auto& u : unsupported) { auto it = options->find(u); if (it != options->end()) { options->erase(it); } } if (config_options.ignore_unknown_options) { - if (!result.ok()) result.PermitUncheckedError(); - if (!notsup.ok()) notsup.PermitUncheckedError(); + if (!result.ok()) { + result.PermitUncheckedError(); + } + if (!notsup.ok()) { + notsup.PermitUncheckedError(); + } return Status::OK(); } else if (!result.ok()) { - if (!notsup.ok()) notsup.PermitUncheckedError(); + if (!notsup.ok()) { + notsup.PermitUncheckedError(); + } return result; } else if (config_options.ignore_unsupported_options) { - if (!notsup.ok()) notsup.PermitUncheckedError(); + if (!notsup.ok()) { + notsup.PermitUncheckedError(); + } return Status::OK(); } else { return notsup; @@ -374,7 +382,7 @@ Status ConfigurableHelper::ConfigureCustomizableOption( return Status::OK(); } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) { return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); - } else if (value.find("=") != std::string::npos) { + } else if (value.find('=') != std::string::npos) { return custom->ConfigureFromString(copy, value); } else { return custom->ConfigureOption(copy, name, value); diff --git a/options/configurable_test.cc b/options/configurable_test.cc index a03d8f0a52f..3ed2d23e3d9 100644 --- a/options/configurable_test.cc +++ b/options/configurable_test.cc @@ -29,8 +29,7 @@ using GFLAGS_NAMESPACE::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); #endif // GFLAGS -namespace ROCKSDB_NAMESPACE { -namespace test { +namespace ROCKSDB_NAMESPACE::test { class StringLogger : public Logger { public: using Logger::Logv; @@ -436,7 +435,7 @@ TEST_F(ConfigurableTest, AliasOptionsTest) { OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"alias", {offsetof(struct TestOptions, b), OptionType::kBoolean, - OptionVerificationType::kAlias, OptionTypeFlags::kNone, 0}}}; + OptionVerificationType::kAlias, OptionTypeFlags::kNone, nullptr}}}; std::unique_ptr orig; orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode, &alias_option_info)); @@ -758,7 +757,7 @@ void ConfigurableParamTest::TestConfigureOptions( ASSERT_OK(base->GetOptionNames(config_options, &names)); std::unordered_map unused; bool found_one = false; - for (auto name : names) { + for (const auto& name : names) { std::string value; Status s = base->GetOption(config_options, name, &value); if (s.ok()) { @@ -849,8 +848,7 @@ INSTANTIATE_TEST_CASE_P( "block_size=1024;" "no_block_cache=true;"))); -} // namespace test -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::test int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff --git a/options/configurable_test.h b/options/configurable_test.h index 3d6fe84108b..7acac776e62 100644 --- a/options/configurable_test.h +++ b/options/configurable_test.h @@ -33,8 +33,8 @@ struct TestOptions { bool b = false; bool d = true; TestEnum e = TestEnum::kTestA; - std::string s = ""; - std::string u = ""; + std::string s; + std::string u; }; static std::unordered_map simple_option_info = { diff --git a/options/customizable.cc b/options/customizable.cc index 2f154d84c57..ac189730df7 100644 --- a/options/customizable.cc +++ b/options/customizable.cc @@ -73,7 +73,7 @@ bool Customizable::AreEquivalent(const ConfigOptions& config_options, std::string* mismatch) const { if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && this != other) { - const Customizable* custom = reinterpret_cast(other); + const Customizable* custom = static_cast(other); if (custom == nullptr) { // Cast failed return false; } else if (GetId() != custom->GetId()) { diff --git a/options/customizable_test.cc b/options/customizable_test.cc index 0e614ed1608..696f1b25edf 100644 --- a/options/customizable_test.cc +++ b/options/customizable_test.cc @@ -1241,7 +1241,8 @@ class TestSecondaryCache : public SecondaryCache { std::unique_ptr Lookup( const Slice& /*key*/, const Cache::CacheItemHelper* /*helper*/, Cache::CreateContext* /*create_context*/, bool /*wait*/, - bool /*advise_erase*/, bool& kept_in_sec_cache) override { + bool /*advise_erase*/, Statistics* /*stats*/, + bool& kept_in_sec_cache) override { kept_in_sec_cache = true; return nullptr; } @@ -1265,7 +1266,7 @@ class TestStatistics : public StatisticsImpl { class TestFlushBlockPolicyFactory : public FlushBlockPolicyFactory { public: - TestFlushBlockPolicyFactory() {} + TestFlushBlockPolicyFactory() = default; static const char* kClassName() { return "TestFlushBlockPolicyFactory"; } const char* Name() const override { return kClassName(); } diff --git a/options/db_options.cc b/options/db_options.cc index bac17199f15..e93d66a0e37 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -35,12 +35,6 @@ static std::unordered_map {"kSkipAnyCorruptedRecords", WALRecoveryMode::kSkipAnyCorruptedRecords}}; -static std::unordered_map - access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE}, - {"NORMAL", DBOptions::AccessHint::NORMAL}, - {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, - {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; - static std::unordered_map cache_tier_string_map = { {"kVolatileTier", CacheTier::kVolatileTier}, {"kNonVolatileBlockTier", CacheTier::kNonVolatileBlockTier}}; @@ -361,10 +355,8 @@ static std::unordered_map OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"access_hint_on_compaction_start", - OptionTypeInfo::Enum( - offsetof(struct ImmutableDBOptions, - access_hint_on_compaction_start), - &access_hint_string_map)}, + OptionTypeInfo::Enum(0, nullptr, OptionTypeFlags::kNone, + OptionVerificationType::kDeprecated)}, {"info_log_level", OptionTypeInfo::Enum( offsetof(struct ImmutableDBOptions, info_log_level), @@ -739,7 +731,6 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) advise_random_on_open(options.advise_random_on_open), db_write_buffer_size(options.db_write_buffer_size), write_buffer_manager(options.write_buffer_manager), - access_hint_on_compaction_start(options.access_hint_on_compaction_start), random_access_max_buffer_size(options.random_access_max_buffer_size), use_adaptive_mutex(options.use_adaptive_mutex), listeners(options.listeners), @@ -881,8 +872,6 @@ void ImmutableDBOptions::Dump(Logger* log) const { db_write_buffer_size); ROCKS_LOG_HEADER(log, " Options.write_buffer_manager: %p", write_buffer_manager.get()); - ROCKS_LOG_HEADER(log, " Options.access_hint_on_compaction_start: %d", - static_cast(access_hint_on_compaction_start)); ROCKS_LOG_HEADER( log, " Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt, random_access_max_buffer_size); @@ -1020,8 +1009,7 @@ MutableDBOptions::MutableDBOptions() wal_bytes_per_sync(0), strict_bytes_per_sync(false), compaction_readahead_size(0), - max_background_flushes(-1), - daily_offpeak_time_utc("") {} + max_background_flushes(-1) {} MutableDBOptions::MutableDBOptions(const DBOptions& options) : max_background_jobs(options.max_background_jobs), diff --git a/options/db_options.h b/options/db_options.h index a7f54e4fe65..37df2e805dc 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -63,7 +63,6 @@ struct ImmutableDBOptions { bool advise_random_on_open; size_t db_write_buffer_size; std::shared_ptr write_buffer_manager; - DBOptions::AccessHint access_hint_on_compaction_start; size_t random_access_max_buffer_size; bool use_adaptive_mutex; std::vector> listeners; diff --git a/options/offpeak_time_info.h b/options/offpeak_time_info.h index 75d61abb49b..f42ef6dc23d 100644 --- a/options/offpeak_time_info.h +++ b/options/offpeak_time_info.h @@ -5,6 +5,7 @@ #pragma once +#include #include #include "rocksdb/rocksdb_namespace.h" diff --git a/options/options.cc b/options/options.cc index 2615018680a..aa0acf1fb22 100644 --- a/options/options.cc +++ b/options/options.cc @@ -71,8 +71,6 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) max_bytes_for_level_multiplier_additional( options.max_bytes_for_level_multiplier_additional), max_compaction_bytes(options.max_compaction_bytes), - ignore_max_compaction_bytes_for_input( - options.ignore_max_compaction_bytes_for_input), soft_pending_compaction_bytes_limit( options.soft_pending_compaction_bytes_limit), hard_pending_compaction_bytes_limit( @@ -87,6 +85,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) table_properties_collector_factories( options.table_properties_collector_factories), max_successive_merges(options.max_successive_merges), + strict_max_successive_merges(options.strict_max_successive_merges), optimize_filters_for_hits(options.optimize_filters_for_hits), paranoid_file_checks(options.paranoid_file_checks), force_consistency_checks(options.force_consistency_checks), @@ -94,6 +93,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), sample_for_compression(options.sample_for_compression), + last_level_temperature(options.last_level_temperature), + default_write_temperature(options.default_write_temperature), default_temperature(options.default_temperature), preclude_last_level_data_seconds( options.preclude_last_level_data_seconds), @@ -127,7 +128,7 @@ ColumnFamilyOptions::ColumnFamilyOptions() ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) : ColumnFamilyOptions(*static_cast(&options)) {} -DBOptions::DBOptions() {} +DBOptions::DBOptions() = default; DBOptions::DBOptions(const Options& options) : DBOptions(*static_cast(&options)) {} @@ -290,8 +291,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_compaction_bytes: %" PRIu64, max_compaction_bytes); - ROCKS_LOG_HEADER(log, " Options.ignore_max_compaction_bytes_for_input: %s", - ignore_max_compaction_bytes_for_input ? "true" : "false"); ROCKS_LOG_HEADER( log, " Options.arena_block_size: %" ROCKSDB_PRIszt, @@ -399,6 +398,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { log, " Options.max_successive_merges: %" ROCKSDB_PRIszt, max_successive_merges); + ROCKS_LOG_HEADER(log, + " Options.strict_max_successive_merges: %d", + strict_max_successive_merges); ROCKS_LOG_HEADER(log, " Options.optimize_filters_for_hits: %d", optimize_filters_for_hits); @@ -537,7 +539,6 @@ Options* Options::DisableExtraChecks() { // See https://github.com/facebook/rocksdb/issues/9354 force_consistency_checks = false; // Considered but no clear performance impact seen: - // * check_flush_compaction_key_order // * paranoid_checks // * flush_verify_memtable_count // By current API contract, not including @@ -707,4 +708,11 @@ ReadOptions::ReadOptions(bool _verify_checksums, bool _fill_cache) ReadOptions::ReadOptions(Env::IOActivity _io_activity) : io_activity(_io_activity) {} +WriteOptions::WriteOptions(Env::IOActivity _io_activity) + : io_activity(_io_activity) {} + +WriteOptions::WriteOptions(Env::IOPriority _rate_limiter_priority, + Env::IOActivity _io_activity) + : rate_limiter_priority(_rate_limiter_priority), + io_activity(_io_activity) {} } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_helper.cc b/options/options_helper.cc index 8932c5c8dad..fce75e41d65 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -46,7 +46,9 @@ Status ValidateOptions(const DBOptions& db_opts, auto db_cfg = DBOptionsAsConfigurable(db_opts); auto cf_cfg = CFOptionsAsConfigurable(cf_opts); s = db_cfg->ValidateOptions(db_opts, cf_opts); - if (s.ok()) s = cf_cfg->ValidateOptions(db_opts, cf_opts); + if (s.ok()) { + s = cf_cfg->ValidateOptions(db_opts, cf_opts); + } return s; } @@ -118,8 +120,6 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.advise_random_on_open = immutable_db_options.advise_random_on_open; options.db_write_buffer_size = immutable_db_options.db_write_buffer_size; options.write_buffer_manager = immutable_db_options.write_buffer_manager; - options.access_hint_on_compaction_start = - immutable_db_options.access_hint_on_compaction_start; options.compaction_readahead_size = mutable_db_options.compaction_readahead_size; options.random_access_max_buffer_size = @@ -208,6 +208,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->strict_max_successive_merges = moptions.strict_max_successive_merges; cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; cf_opts->prefix_extractor = moptions.prefix_extractor; cf_opts->disable_auto_flush = moptions.disable_auto_flush; @@ -233,8 +234,6 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.level0_slowdown_writes_trigger; cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; - cf_opts->ignore_max_compaction_bytes_for_input = - moptions.ignore_max_compaction_bytes_for_input; cf_opts->target_file_size_base = moptions.target_file_size_base; cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; @@ -270,8 +269,6 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, // Misc options cf_opts->max_sequential_skip_in_iterations = moptions.max_sequential_skip_in_iterations; - cf_opts->check_flush_compaction_key_order = - moptions.check_flush_compaction_key_order; cf_opts->paranoid_file_checks = moptions.paranoid_file_checks; cf_opts->report_bg_io_stats = moptions.report_bg_io_stats; cf_opts->compression = moptions.compression; @@ -281,7 +278,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->sample_for_compression = moptions.sample_for_compression; cf_opts->compression_per_level = moptions.compression_per_level; cf_opts->last_level_temperature = moptions.last_level_temperature; - cf_opts->bottommost_temperature = moptions.last_level_temperature; + cf_opts->default_write_temperature = moptions.default_write_temperature; cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions; } @@ -308,8 +305,6 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->bloom_locality = ioptions.bloom_locality; cf_opts->level_compaction_dynamic_level_bytes = ioptions.level_compaction_dynamic_level_bytes; - cf_opts->level_compaction_dynamic_file_size = - ioptions.level_compaction_dynamic_file_size; cf_opts->num_levels = ioptions.num_levels; cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; cf_opts->force_consistency_checks = ioptions.force_consistency_checks; @@ -918,7 +913,7 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options, ConfigOptions copy = config_options; copy.ignore_unknown_options = false; copy.invoke_prepare_options = false; - if (opt_value.find("=") != std::string::npos) { + if (opt_value.find('=') != std::string::npos) { return config->ConfigureFromString(copy, opt_value); } else { return config->ConfigureOption(copy, opt_name, opt_value); @@ -1053,7 +1048,7 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, } std::string value = custom->ToString(embedded); if (!embedded.mutable_options_only || - value.find("=") != std::string::npos) { + value.find('=') != std::string::npos) { *opt_value = value; } else { *opt_value = ""; @@ -1429,7 +1424,7 @@ const OptionTypeInfo* OptionTypeInfo::Find( *elem_name = opt_name; // Return the name return &(iter->second); // Return the contents of the iterator } else { - auto idx = opt_name.find("."); // Look for a separator + auto idx = opt_name.find('.'); // Look for a separator if (idx > 0 && idx != std::string::npos) { // We found a separator auto siter = opt_map.find(opt_name.substr(0, idx)); // Look for the short name diff --git a/options/options_helper.h b/options/options_helper.h index 76e312a63cf..679a1a7eeda 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -65,9 +65,8 @@ std::unique_ptr CFOptionsAsConfigurable( const ColumnFamilyOptions& opts, const std::unordered_map* opt_map = nullptr); -extern Status StringToMap( - const std::string& opts_str, - std::unordered_map* opts_map); +Status StringToMap(const std::string& opts_str, + std::unordered_map* opts_map); struct OptionsHelper { static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/; diff --git a/options/options_parser.cc b/options/options_parser.cc index a8c855d6e22..ec32f764472 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -35,7 +35,8 @@ static const std::string option_file_header = "#\n" "\n"; -Status PersistRocksDBOptions(const DBOptions& db_opt, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs) { @@ -48,11 +49,12 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, if (db_opt.log_readahead_size > 0) { config_options.file_readahead_size = db_opt.log_readahead_size; } - return PersistRocksDBOptions(config_options, db_opt, cf_names, cf_opts, - file_name, fs); + return PersistRocksDBOptions(write_options, config_options, db_opt, cf_names, + cf_opts, file_name, fs); } -Status PersistRocksDBOptions(const ConfigOptions& config_options_in, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const ConfigOptions& config_options_in, const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, @@ -79,62 +81,70 @@ Status PersistRocksDBOptions(const ConfigOptions& config_options_in, std::string options_file_content; - s = writable->Append( - option_file_header + "[" + opt_section_titles[kOptionSectionVersion] + - "]\n" - " rocksdb_version=" + - std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR) + - "." + std::to_string(ROCKSDB_PATCH) + "\n"); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = writable->Append(opts, option_file_header + "[" + + opt_section_titles[kOptionSectionVersion] + + "]\n" + " rocksdb_version=" + + std::to_string(ROCKSDB_MAJOR) + "." + + std::to_string(ROCKSDB_MINOR) + "." + + std::to_string(ROCKSDB_PATCH) + "\n"); + } if (s.ok()) { s = writable->Append( + opts, " options_file_version=" + std::to_string(ROCKSDB_OPTION_FILE_MAJOR) + - "." + std::to_string(ROCKSDB_OPTION_FILE_MINOR) + "\n"); + "." + std::to_string(ROCKSDB_OPTION_FILE_MINOR) + "\n"); } if (s.ok()) { - s = writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] + - "]\n "); + s = writable->Append( + opts, "\n[" + opt_section_titles[kOptionSectionDBOptions] + "]\n "); } if (s.ok()) { s = GetStringFromDBOptions(config_options, db_opt, &options_file_content); } if (s.ok()) { - s = writable->Append(options_file_content + "\n"); + s = writable->Append(opts, options_file_content + "\n"); } for (size_t i = 0; s.ok() && i < cf_opts.size(); ++i) { // CFOptions section - s = writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] + - " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + s = writable->Append( + opts, "\n[" + opt_section_titles[kOptionSectionCFOptions] + " \"" + + EscapeOptionString(cf_names[i]) + "\"]\n "); if (s.ok()) { s = GetStringFromColumnFamilyOptions(config_options, cf_opts[i], &options_file_content); } if (s.ok()) { - s = writable->Append(options_file_content + "\n"); + s = writable->Append(opts, options_file_content + "\n"); } // TableOptions section auto* tf = cf_opts[i].table_factory.get(); if (tf != nullptr) { if (s.ok()) { s = writable->Append( - "[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() + - " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + opts, "[" + opt_section_titles[kOptionSectionTableOptions] + + tf->Name() + " \"" + EscapeOptionString(cf_names[i]) + + "\"]\n "); } if (s.ok()) { options_file_content.clear(); s = tf->GetOptionString(config_options, &options_file_content); } if (s.ok()) { - s = writable->Append(options_file_content + "\n"); + s = writable->Append(opts, options_file_content + "\n"); } } } if (s.ok()) { - s = writable->Sync(true /* use_fsync */); + s = writable->Sync(opts, true /* use_fsync */); } if (s.ok()) { - s = writable->Close(); + s = writable->Close(opts); } TEST_SYNC_POINT("PersistRocksDBOptions:written"); if (s.ok()) { @@ -179,8 +189,8 @@ Status RocksDBOptionsParser::ParseSection(OptionSection* section, *section = kOptionSectionUnknown; // A section is of the form [ ""], where // "" is optional. - size_t arg_start_pos = line.find("\""); - size_t arg_end_pos = line.rfind("\""); + size_t arg_start_pos = line.find('\"'); + size_t arg_end_pos = line.rfind('\"'); // The following if-then check tries to identify whether the input // section has the optional section argument. if (arg_start_pos != std::string::npos && arg_start_pos != arg_end_pos) { @@ -224,7 +234,7 @@ Status RocksDBOptionsParser::ParseStatement(std::string* name, std::string* value, const std::string& line, const int line_num) { - size_t eq_pos = line.find("="); + size_t eq_pos = line.find('='); if (eq_pos == std::string::npos) { return InvalidArgument(line_num, "A valid statement must have a '='."); } @@ -733,4 +743,3 @@ Status RocksDBOptionsParser::VerifyTableFactory( return Status::OK(); } } // namespace ROCKSDB_NAMESPACE - diff --git a/options/options_parser.h b/options/options_parser.h index 4268051f340..e702c9f4999 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -32,11 +32,13 @@ enum OptionSection : char { static const std::string opt_section_titles[] = { "Version", "DBOptions", "CFOptions", "TableOptions/", "Unknown"}; -Status PersistRocksDBOptions(const DBOptions& db_opt, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs); -Status PersistRocksDBOptions(const ConfigOptions& config_options, +Status PersistRocksDBOptions(const WriteOptions& write_options, + const ConfigOptions& config_options, const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 73548304e5b..440aa7788ee 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -157,8 +157,6 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { // GetBlockBasedTableOptionsFromString(). bbto = new (bbto_ptr) BlockBasedTableOptions(); FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); - // This option is not setable: - bbto->use_delta_encoding = true; char* new_bbto_ptr = new char[sizeof(BlockBasedTableOptions)]; BlockBasedTableOptions* new_bbto = @@ -191,6 +189,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "metadata_block_size=1024;" "partition_filters=false;" "optimize_filters_for_memory=true;" + "use_delta_encoding=true;" "index_block_restart_interval=4;" "filter_policy=bloomfilter:4:true;whole_key_filtering=1;detect_filter_" "construct_corruption=false;" @@ -345,7 +344,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "enable_write_thread_adaptive_yield=true;" "write_thread_slow_yield_usec=5;" "write_thread_max_yield_usec=1000;" - "access_hint_on_compaction_start=NONE;" "info_log_level=DEBUG_LEVEL;" "dump_malloc_stats=false;" "allow_2pc=false;" @@ -495,6 +493,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" "max_successive_merges=5497;" + "strict_max_successive_merges=true;" "max_sequential_skip_in_iterations=4294971408;" "arena_block_size=1893;" "target_file_size_multiplier=35;" @@ -556,6 +555,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "prepopulate_blob_cache=kDisable;" "bottommost_temperature=kWarm;" "last_level_temperature=kWarm;" + "default_write_temperature=kCold;" "default_temperature=kHot;" "preclude_last_level_data_seconds=86400;" "preserve_internal_time_seconds=86400;" diff --git a/options/options_test.cc b/options/options_test.cc index af031422fa1..90eda04aa58 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -49,7 +49,7 @@ class OptionsTest : public testing::Test {}; class UnregisteredTableFactory : public TableFactory { public: - UnregisteredTableFactory() {} + UnregisteredTableFactory() = default; const char* Name() const override { return "Unregistered"; } using TableFactory::NewTableReader; Status NewTableReader(const ReadOptions&, const TableReaderOptions&, @@ -115,6 +115,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"memtable_huge_page_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, + {"strict_max_successive_merges", "true"}, {"min_partial_merge_operands", "31"}, {"prefix_extractor", "fixed:31"}, {"experimental_mempurge_threshold", "0.003"}, @@ -131,6 +132,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"blob_file_starting_level", "1"}, {"prepopulate_blob_cache", "kDisable"}, {"last_level_temperature", "kWarm"}, + {"default_write_temperature", "kCold"}, {"default_temperature", "kHot"}, {"persist_user_defined_timestamps", "true"}, {"memtable_max_range_deletions", "0"}, @@ -271,6 +273,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); ASSERT_EQ(new_cf_opt.bloom_locality, 29U); ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true); ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); @@ -286,9 +289,9 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.blob_file_starting_level, 1); ASSERT_EQ(new_cf_opt.prepopulate_blob_cache, PrepopulateBlobCache::kDisable); ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm); - ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm); ASSERT_EQ(new_cf_opt.disable_auto_flush, false); ASSERT_EQ(new_cf_opt.disable_write_stall, false); + ASSERT_EQ(new_cf_opt.default_write_temperature, Temperature::kCold); ASSERT_EQ(new_cf_opt.default_temperature, Temperature::kHot); ASSERT_EQ(new_cf_opt.persist_user_defined_timestamps, true); ASSERT_EQ(new_cf_opt.memtable_max_range_deletions, 0); @@ -1891,7 +1894,7 @@ TEST_F(OptionsTest, StringToMapRandomTest) { "a={aa={};tt={xxx={}}};c=defff;d={{}yxx{}3{xx}}", "abc={{}{}{}{{{}}}{{}{}{}{}{}{}{}"}; - for (std::string base : bases) { + for (const std::string& base : bases) { for (int rand_seed = 301; rand_seed < 401; rand_seed++) { Random rnd(rand_seed); for (int attempt = 0; attempt < 10; attempt++) { @@ -1912,7 +1915,7 @@ TEST_F(OptionsTest, StringToMapRandomTest) { for (int rand_seed = 301; rand_seed < 1301; rand_seed++) { Random rnd(rand_seed); int len = rnd.Uniform(30); - std::string str = ""; + std::string str; for (int attempt = 0; attempt < len; attempt++) { // Add a random character size_t pos = static_cast( @@ -2336,6 +2339,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"memtable_huge_page_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, + {"strict_max_successive_merges", "true"}, {"min_partial_merge_operands", "31"}, {"prefix_extractor", "fixed:31"}, {"experimental_mempurge_threshold", "0.003"}, @@ -2351,6 +2355,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"blob_file_starting_level", "1"}, {"prepopulate_blob_cache", "kDisable"}, {"last_level_temperature", "kWarm"}, + {"default_write_temperature", "kCold"}, {"default_temperature", "kHot"}, {"persist_user_defined_timestamps", "true"}, {"memtable_max_range_deletions", "0"}, @@ -2454,7 +2459,6 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); - ASSERT_EQ(new_cf_opt.level_compaction_dynamic_file_size, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16); @@ -2488,6 +2492,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); ASSERT_EQ(new_cf_opt.bloom_locality, 29U); ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + ASSERT_EQ(new_cf_opt.strict_max_successive_merges, true); ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); @@ -2503,7 +2508,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.blob_file_starting_level, 1); ASSERT_EQ(new_cf_opt.prepopulate_blob_cache, PrepopulateBlobCache::kDisable); ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm); - ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm); + ASSERT_EQ(new_cf_opt.default_write_temperature, Temperature::kCold); ASSERT_EQ(new_cf_opt.default_temperature, Temperature::kHot); ASSERT_EQ(new_cf_opt.persist_user_defined_timestamps, true); ASSERT_EQ(new_cf_opt.memtable_max_range_deletions, 0); @@ -3557,7 +3562,7 @@ TEST_F(OptionsParserTest, ParseVersion) { "3..2", ".", ".1.2", // must have at least one digit before each dot "1.2.", "1.", "2.34."}; // must have at least one digit after each dot - for (auto iv : invalid_versions) { + for (const auto& iv : invalid_versions) { snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str()); parser.Reset(); @@ -3567,7 +3572,7 @@ TEST_F(OptionsParserTest, ParseVersion) { const std::vector valid_versions = { "1.232", "100", "3.12", "1", "12.3 ", " 1.25 "}; - for (auto vv : valid_versions) { + for (const auto& vv : valid_versions) { snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str()); parser.Reset(); ASSERT_OK(fs_->WriteToNewFile(vv, buffer)); @@ -3675,8 +3680,8 @@ TEST_F(OptionsParserTest, Readahead) { std::vector cf_names = {"default", one_mb_string}; const std::string kOptionsFileName = "test-persisted-options.ini"; - ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts, - kOptionsFileName, fs_.get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), base_db_opt, cf_names, + base_cf_opts, kOptionsFileName, fs_.get())); uint64_t file_size = 0; ASSERT_OK( @@ -3750,8 +3755,8 @@ TEST_F(OptionsParserTest, DumpAndParse) { const std::string kOptionsFileName = "test-persisted-options.ini"; // Use default for escaped(true), unknown(false) and check (exact) ConfigOptions config_options; - ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts, - kOptionsFileName, fs_.get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), base_db_opt, cf_names, + base_cf_opts, kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; ASSERT_OK(parser.Parse(config_options, kOptionsFileName, fs_.get())); @@ -3811,9 +3816,9 @@ TEST_F(OptionsParserTest, DifferentDefault) { ColumnFamilyOptions cf_univ_opts; cf_univ_opts.OptimizeUniversalStyleCompaction(); - ASSERT_OK(PersistRocksDBOptions(DBOptions(), {"default", "universal"}, - {cf_level_opts, cf_univ_opts}, - kOptionsFileName, fs_.get())); + ASSERT_OK(PersistRocksDBOptions( + WriteOptions(), DBOptions(), {"default", "universal"}, + {cf_level_opts, cf_univ_opts}, kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, @@ -3956,8 +3961,8 @@ class OptionsSanityCheckTest : public OptionsParserTest, if (!s.ok()) { return s; } - return PersistRocksDBOptions(db_opts, {"default"}, {cf_opts}, - kOptionsFileName, fs_.get()); + return PersistRocksDBOptions(WriteOptions(), db_opts, {"default"}, + {cf_opts}, kOptionsFileName, fs_.get()); } Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { @@ -4646,42 +4651,42 @@ TEST_F(OptionTypeInfoTest, TestCustomEnum) { TEST_F(OptionTypeInfoTest, TestBuiltinEnum) { ConfigOptions config_options; - for (auto iter : OptionsHelper::compaction_style_string_map) { + for (const auto& iter : OptionsHelper::compaction_style_string_map) { CompactionStyle e1, e2; TestParseAndCompareOption(config_options, OptionTypeInfo(0, OptionType::kCompactionStyle), "CompactionStyle", iter.first, &e1, &e2); ASSERT_EQ(e1, iter.second); } - for (auto iter : OptionsHelper::compaction_pri_string_map) { + for (const auto& iter : OptionsHelper::compaction_pri_string_map) { CompactionPri e1, e2; TestParseAndCompareOption(config_options, OptionTypeInfo(0, OptionType::kCompactionPri), "CompactionPri", iter.first, &e1, &e2); ASSERT_EQ(e1, iter.second); } - for (auto iter : OptionsHelper::compression_type_string_map) { + for (const auto& iter : OptionsHelper::compression_type_string_map) { CompressionType e1, e2; TestParseAndCompareOption(config_options, OptionTypeInfo(0, OptionType::kCompressionType), "CompressionType", iter.first, &e1, &e2); ASSERT_EQ(e1, iter.second); } - for (auto iter : OptionsHelper::compaction_stop_style_string_map) { + for (const auto& iter : OptionsHelper::compaction_stop_style_string_map) { CompactionStopStyle e1, e2; TestParseAndCompareOption( config_options, OptionTypeInfo(0, OptionType::kCompactionStopStyle), "CompactionStopStyle", iter.first, &e1, &e2); ASSERT_EQ(e1, iter.second); } - for (auto iter : OptionsHelper::checksum_type_string_map) { + for (const auto& iter : OptionsHelper::checksum_type_string_map) { ChecksumType e1, e2; TestParseAndCompareOption(config_options, OptionTypeInfo(0, OptionType::kChecksumType), "CheckSumType", iter.first, &e1, &e2); ASSERT_EQ(e1, iter.second); } - for (auto iter : OptionsHelper::encoding_type_string_map) { + for (const auto& iter : OptionsHelper::encoding_type_string_map) { EncodingType e1, e2; TestParseAndCompareOption(config_options, OptionTypeInfo(0, OptionType::kEncodingType), diff --git a/port/lang.h b/port/lang.h index a4201ca3b28..e9d68bd7823 100644 --- a/port/lang.h +++ b/port/lang.h @@ -72,7 +72,7 @@ constexpr bool kMustFreeHeapAllocations = false; // Compile-time CPU feature testing compatibility // // A way to be extra sure these defines have been included. -#define ASSERT_FEATURE_COMPAT_HEADER() /* empty */ +#define ASSERT_FEATURE_COMPAT_HEADER() static_assert(true, "Semicolon required") /* empty */ // MSVC doesn't support the same defines that gcc and clang provide // but does some like __AVX__. Here we can infer some features from others. diff --git a/port/port_example.h b/port/port_example.h index 2a19ffee055..f9e94d00f86 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -72,20 +72,20 @@ class CondVar { // port::InitOnce(&init_control, &Initializer); using OnceType = intptr_t; #define LEVELDB_ONCE_INIT 0 -extern void InitOnce(port::OnceType*, void (*initializer)()); +void InitOnce(port::OnceType*, void (*initializer)()); // ------------------ Compression ------------------- // Store the snappy compression of "input[0,input_length-1]" in *output. // Returns false if snappy is not supported by this port. -extern bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); +bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); // If input[0,input_length-1] looks like a valid snappy compressed // buffer, store the size of the uncompressed data in *result and // return true. Else return false. -extern bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result); +bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); // Attempt to snappy uncompress input[0,input_length-1] into *output. // Returns true if successful, false if the input is invalid lightweight @@ -94,8 +94,8 @@ extern bool Snappy_GetUncompressedLength(const char* input, size_t length, // REQUIRES: at least the first "n" bytes of output[] must be writable // where "n" is the result of a successful call to // Snappy_GetUncompressedLength. -extern bool Snappy_Uncompress(const char* input_data, size_t input_length, - char* output); +bool Snappy_Uncompress(const char* input_data, size_t input_length, + char* output); } // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/port_posix.cc b/port/port_posix.cc index 749ad5d607d..7042a710dc8 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -11,20 +11,20 @@ #include "port/port_posix.h" -#include +#include #if defined(__i386__) || defined(__x86_64__) #include #endif -#include #include -#include -#include -#include #include #include #include +#include +#include +#include #include +#include #include #include @@ -41,9 +41,9 @@ namespace ROCKSDB_NAMESPACE { // build environment then this happens automatically; otherwise it's up to the // consumer to define the identifier. #ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX -extern const bool kDefaultToAdaptiveMutex = true; +const bool kDefaultToAdaptiveMutex = true; #else -extern const bool kDefaultToAdaptiveMutex = false; +const bool kDefaultToAdaptiveMutex = false; #endif namespace port { diff --git a/port/port_posix.h b/port/port_posix.h index 95641c0c54b..386be532cfd 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -179,11 +179,11 @@ static inline void AsmVolatilePause() { } // Returns -1 if not available on this platform -extern int PhysicalCoreID(); +int PhysicalCoreID(); using OnceType = pthread_once_t; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT -extern void InitOnce(OnceType* once, void (*initializer)()); +void InitOnce(OnceType* once, void (*initializer)()); #ifndef CACHE_LINE_SIZE // To test behavior with non-native cache line size, e.g. for @@ -211,9 +211,9 @@ extern void InitOnce(OnceType* once, void (*initializer)()); static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0, "Cache line size must be a power of 2 number of bytes"); -extern void* cacheline_aligned_alloc(size_t size); +void* cacheline_aligned_alloc(size_t size); -extern void cacheline_aligned_free(void* memblock); +void cacheline_aligned_free(void* memblock); #if defined(__aarch64__) // __builtin_prefetch(..., 1) turns into a prefetch into prfm pldl3keep. On @@ -226,15 +226,15 @@ extern void cacheline_aligned_free(void* memblock); #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) #endif -extern void Crash(const std::string& srcfile, int srcline); +void Crash(const std::string& srcfile, int srcline); -extern int GetMaxOpenFiles(); +int GetMaxOpenFiles(); extern const size_t kPageSize; using ThreadId = pid_t; -extern void SetCpuPriority(ThreadId id, CpuPriority priority); +void SetCpuPriority(ThreadId id, CpuPriority priority); int64_t GetProcessID(); diff --git a/port/stack_trace.cc b/port/stack_trace.cc index a5a6d2e77c8..f4909f91d5d 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -25,12 +25,14 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #include #include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include + #ifdef OS_OPENBSD #include #include @@ -48,10 +50,12 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #endif // GLIBC version #endif // OS_LINUX +#include +#include + #include "port/lang.h" -namespace ROCKSDB_NAMESPACE { -namespace port { +namespace ROCKSDB_NAMESPACE::port { namespace { @@ -252,9 +256,9 @@ void PrintStack(int first_frames_to_skip) { if (lldb_stack_trace) { fprintf(stderr, "Invoking LLDB for stack trace...\n"); - // Skip top ~8 frames here in PrintStack + // Skip top ~4 frames here in PrintStack auto bt_in_lldb = - "script -l python -- for f in lldb.thread.frames[8:]: print(f)"; + "script -l python -- for f in lldb.thread.frames[4:]: print(f)"; execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, "-b", "-Q", "-o", GetLldbScriptSelectThread(attach_tid), "-o", bt_in_lldb, (char*)nullptr); @@ -311,28 +315,85 @@ void* SaveStack(int* num_frames, int first_frames_to_skip) { return callstack; } +static std::atomic g_thread_handling_stack_trace{0}; +static int g_recursion_count = 0; +static std::atomic g_at_exit_called{false}; + static void StackTraceHandler(int sig) { - // reset to default handler - signal(sig, SIG_DFL); fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); - // skip the top three signal handler related frames - PrintStack(3); + // Crude recursive mutex with no signal-unsafe system calls, to avoid + // re-entrance from multiple threads and avoid core dumping while trying + // to print the stack trace. + uint64_t tid = 0; + { + const auto ptid = pthread_self(); + // pthread_t is an opaque type + memcpy(&tid, &ptid, std::min(sizeof(tid), sizeof(ptid))); + // Essentially ensure non-zero + ++tid; + } + for (;;) { + uint64_t expected = 0; + if (g_thread_handling_stack_trace.compare_exchange_strong(expected, tid)) { + // Acquired mutex + g_recursion_count = 0; + break; + } + if (expected == tid) { + ++g_recursion_count; + fprintf(stderr, "Recursive call to stack trace handler (%d)\n", + g_recursion_count); + break; + } + // Sleep before trying again + usleep(1000); + } - // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of - // a signal" have failed, so just warn the user about them. + if (g_recursion_count > 2) { + // Give up after too many recursions + fprintf(stderr, "Too many recursive calls to stack trace handler (%d)\n", + g_recursion_count); + } else { + if (g_at_exit_called.load(std::memory_order_acquire)) { + fprintf(stderr, "In a race with process already exiting...\n"); + } + + // skip the top three signal handler related frames + PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. #ifdef __SANITIZE_THREAD__ - fprintf(stderr, - "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" - "==> ignorable, as they are expected when generating a stack\n" - "==> trace because of a signal under TSAN. Consider why the\n" - "==> signal was generated to begin with, and the stack trace\n" - "==> in the TSAN warning can be useful for that. (The stack\n" - "==> trace printed by the signal handler is likely obscured\n" - "==> by TSAN output.)\n"); + fprintf(stderr, + "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" + "==> ignorable, as they are expected when generating a stack\n" + "==> trace because of a signal under TSAN. Consider why the\n" + "==> signal was generated to begin with, and the stack trace\n" + "==> in the TSAN warning can be useful for that. (The stack\n" + "==> trace printed by the signal handler is likely obscured\n" + "==> by TSAN output.)\n"); #endif + } + // reset to default handler + signal(sig, SIG_DFL); // re-signal to default handler (so we still get core dump if needed...) raise(sig); + + // release the mutex, in case this is somehow recoverable + if (g_recursion_count > 0) { + --g_recursion_count; + } else { + g_thread_handling_stack_trace.store(0, std::memory_order_release); + } +} + +static void AtExit() { + // wait for stack trace handler to finish, if needed + while (g_thread_handling_stack_trace.load(std::memory_order_acquire)) { + usleep(1000); + } + g_at_exit_called.store(true, std::memory_order_release); } void InstallStackTraceHandler() { @@ -342,6 +403,7 @@ void InstallStackTraceHandler() { signal(SIGSEGV, StackTraceHandler); signal(SIGBUS, StackTraceHandler); signal(SIGABRT, StackTraceHandler); + atexit(AtExit); // Allow ouside debugger to attach, even with Yama security restrictions. // This is needed even outside of PrintStack() so that external mechanisms // can dump stacks if they suspect that a test has hung. @@ -350,7 +412,6 @@ void InstallStackTraceHandler() { #endif } -} // namespace port -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::port #endif diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 2262eb59c41..1ea237b9ba8 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -1286,8 +1286,7 @@ struct StartThreadState { }; void* StartThreadWrapper(void* arg) { - std::unique_ptr state( - reinterpret_cast(arg)); + std::unique_ptr state(static_cast(arg)); state->user_function(state->arg); return nullptr; } diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 4fa735518f7..2ba64b32655 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -230,7 +230,7 @@ IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n, } else if (offset + n > length_) { n = length_ - static_cast(offset); } - *result = Slice(reinterpret_cast(mapped_region_) + offset, n); + *result = Slice(static_cast(mapped_region_) + offset, n); return s; } @@ -327,9 +327,9 @@ IOStatus WinMmapFile::MapNewRegion(const IOOptions& options, offset.QuadPart = file_offset_; // View must begin at the granularity aligned offset - mapped_begin_ = reinterpret_cast( - MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, - view_size_, NULL)); + mapped_begin_ = + static_cast(MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, + offset.LowPart, view_size_, NULL)); if (!mapped_begin_) { status = IOErrorFromWindowsError( diff --git a/port/win/io_win.h b/port/win/io_win.h index a4fee8346c4..e1a6197ce86 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -125,9 +125,7 @@ class WinSequentialFile : protected WinFileData, public FSSequentialFile { IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { - return WinFileData::use_direct_io(); - } + bool use_direct_io() const override { return WinFileData::use_direct_io(); } }; // mmap() based random-access @@ -151,9 +149,9 @@ class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { Slice* result, char* scratch, IODebugContext* dbg) const override; - virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; }; // We preallocate and use memcpy to append new @@ -243,7 +241,7 @@ class WinMmapFile : private WinFileData, public FSWritableFile { IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, IODebugContext* dbg) override; - virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; }; class WinRandomAccessImpl { @@ -287,15 +285,13 @@ class WinRandomAccessFile Slice* result, char* scratch, IODebugContext* dbg) const override; - virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; - virtual bool use_direct_io() const override { - return WinFileData::use_direct_io(); - } + bool use_direct_io() const override { return WinFileData::use_direct_io(); } IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; }; // This is a sequential write class. It has been mimicked (as others) after @@ -399,20 +395,20 @@ class WinWritableFile : private WinFileData, IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; - virtual bool IsSyncThreadSafe() const override; + bool IsSyncThreadSafe() const override; // Indicates if the class makes use of direct I/O // Use PositionedAppend - virtual bool use_direct_io() const override; + bool use_direct_io() const override; - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, IODebugContext* dbg) override; - virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; }; class WinRandomRWFile : private WinFileData, @@ -427,11 +423,11 @@ class WinRandomRWFile : private WinFileData, // Indicates if the class makes use of direct I/O // If false you must pass aligned buffer to Write() - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate aligned // buffer for Write() when use_direct_io() returns true - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. diff --git a/port/win/port_win.cc b/port/win/port_win.cc index 37e8f655ce3..bdfe656018d 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -36,7 +36,7 @@ namespace ROCKSDB_NAMESPACE { -extern const bool kDefaultToAdaptiveMutex = false; +const bool kDefaultToAdaptiveMutex = false; namespace port { @@ -288,7 +288,8 @@ bool GenerateRfcUuid(std::string* output) { return false; } - // rpc_str is nul-terminated + // rpc_str is nul-terminated. + // reinterpret_cast for possible change between signed/unsigned char. *output = reinterpret_cast(rpc_str); status = RpcStringFreeA(&rpc_str); diff --git a/port/win/port_win.h b/port/win/port_win.h index 4aa10d0052b..54c8f158165 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -213,7 +213,7 @@ struct OnceType { }; #define LEVELDB_ONCE_INIT port::OnceType::Init() -extern void InitOnce(OnceType* once, void (*initializer)()); +void InitOnce(OnceType* once, void (*initializer)()); #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 64U @@ -253,7 +253,7 @@ static inline void AsmVolatilePause() { // it would be nice to get "wfe" on ARM here } -extern int PhysicalCoreID(); +int PhysicalCoreID(); // For Thread Local Storage abstraction using pthread_key_t = DWORD; @@ -303,13 +303,13 @@ inline void* pthread_getspecific(pthread_key_t key) { int truncate(const char* path, int64_t length); int Truncate(std::string path, int64_t length); void Crash(const std::string& srcfile, int srcline); -extern int GetMaxOpenFiles(); +int GetMaxOpenFiles(); std::string utf16_to_utf8(const std::wstring& utf16); std::wstring utf8_to_utf16(const std::string& utf8); using ThreadId = int; -extern void SetCpuPriority(ThreadId id, CpuPriority priority); +void SetCpuPriority(ThreadId id, CpuPriority priority); int64_t GetProcessID(); diff --git a/src.mk b/src.mk index 3b8cc56bc3a..238d57bf2d0 100644 --- a/src.mk +++ b/src.mk @@ -92,6 +92,7 @@ LIB_SOURCES = \ db/memtable_list.cc \ db/merge_helper.cc \ db/merge_operator.cc \ + db/multi_cf_iterator.cc \ db/output_validator.cc \ db/periodic_task_scheduler.cc \ db/range_del_aggregator.cc \ @@ -538,6 +539,7 @@ TEST_MAIN_SOURCES = \ db/memtable_list_test.cc \ db/merge_helper_test.cc \ db/merge_test.cc \ + db/multi_cf_iterator_test.cc \ db/obsolete_files_test.cc \ db/options_file_test.cc \ db/perf_context_test.cc \ @@ -695,6 +697,7 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/hyper_clock_cache.cc \ java/rocksjni/iterator.cc \ java/rocksjni/jni_perf_context.cc \ + java/rocksjni/jni_multiget_helpers.cc \ java/rocksjni/jnicallback.cc \ java/rocksjni/loggerjnicallback.cc \ java/rocksjni/lru_cache.cc \ @@ -724,9 +727,11 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/sst_partitioner.cc \ java/rocksjni/statistics.cc \ java/rocksjni/statisticsjni.cc \ + java/rocksjni/stderr_logger.cc \ java/rocksjni/table.cc \ java/rocksjni/table_filter.cc \ java/rocksjni/table_filter_jnicallback.cc \ + java/rocksjni/table_properties_collector_factory.cc \ java/rocksjni/thread_status.cc \ java/rocksjni/trace_writer.cc \ java/rocksjni/trace_writer_jnicallback.cc \ diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc index 5a573ca992a..f06b265328f 100644 --- a/table/adaptive/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -34,12 +34,6 @@ AdaptiveTableFactory::AdaptiveTableFactory( } } -extern const uint64_t kPlainTableMagicNumber; -extern const uint64_t kLegacyPlainTableMagicNumber; -extern const uint64_t kBlockBasedTableMagicNumber; -extern const uint64_t kLegacyBlockBasedTableMagicNumber; -extern const uint64_t kCuckooTableMagicNumber; - Status AdaptiveTableFactory::NewTableReader( const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, @@ -112,7 +106,7 @@ std::string AdaptiveTableFactory::GetPrintableOptions() const { return ret; } -extern TableFactory* NewAdaptiveTableFactory( +TableFactory* NewAdaptiveTableFactory( std::shared_ptr table_factory_to_write, std::shared_ptr block_based_table_factory, std::shared_ptr plain_table_factory, diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 9bebdfbdcab..ea4d559a2a4 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -49,8 +49,12 @@ struct DecodeEntry { // Fast path: all three values are encoded in one byte each p += 3; } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) { + return nullptr; + } + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) { + return nullptr; + } if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { return nullptr; } @@ -82,8 +86,12 @@ struct CheckAndDecodeEntry { // Fast path: all three values are encoded in one byte each p += 3; } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) { + return nullptr; + } + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) { + return nullptr; + } if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { return nullptr; } @@ -113,15 +121,21 @@ struct DecodeKeyV4 { // We need 2 bytes for shared and non_shared size. We also need one more // byte either for value size or the actual value in case of value delta // encoding. - if (limit - p < 3) return nullptr; + if (limit - p < 3) { + return nullptr; + } *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; if ((*shared | *non_shared) < 128) { // Fast path: all three values are encoded in one byte each p += 2; } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) { + return nullptr; + } + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) { + return nullptr; + } } return p; } @@ -140,7 +154,9 @@ struct DecodeEntryV4 { void DataBlockIter::NextImpl() { #ifndef NDEBUG - if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return; + if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) { + return; + } #endif bool is_shared = false; ParseNextDataKey(&is_shared); @@ -319,11 +335,12 @@ void MetaBlockIter::SeekImpl(const Slice& target) { // target = "seek_user_key @ type | seqno". // // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, -// kTypeBlobIndex, kTypeWideColumnEntity or kTypeMerge, this function behaves -// identically to Seek(). +// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or +// kTypeMerge, this function behaves identically to Seek(). // // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, -// kTypeBlobIndex, kTypeWideColumnEntity, or kTypeMerge: +// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or +// kTypeMerge: // // If the return value is FALSE, iter location is undefined, and it means: // 1) there is no key in this block falling into the range: @@ -436,7 +453,8 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { value_type != ValueType::kTypeMerge && value_type != ValueType::kTypeSingleDeletion && value_type != ValueType::kTypeBlobIndex && - value_type != ValueType::kTypeWideColumnEntity) { + value_type != ValueType::kTypeWideColumnEntity && + value_type != ValueType::kTypeValuePreferredSeqno) { SeekImpl(target); } @@ -446,7 +464,9 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { void IndexBlockIter::SeekImpl(const Slice& target) { #ifndef NDEBUG - if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return; + if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) { + return; + } #endif TEST_SYNC_POINT("IndexBlockIter::Seek:0"); PERF_TIMER_GUARD(block_seek_nanos); @@ -560,7 +580,9 @@ void MetaBlockIter::SeekToFirstImpl() { void IndexBlockIter::SeekToFirstImpl() { #ifndef NDEBUG - if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return; + if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) { + return; + } #endif if (data_ == nullptr) { // Not init yet return; @@ -910,7 +932,9 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, // Key at "target" is <= "mid". Therefore all blocks // after "mid" are uninteresting. // If there is only one block left, we found it. - if (left == right) break; + if (left == right) { + break; + } right = mid; } } diff --git a/table/block_based/block.h b/table/block_based/block.h index dcd83aa6e4a..439598ba54e 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -340,7 +340,7 @@ class BlockIter : public InternalIteratorBase { return current_ < restarts_; } - virtual void SeekToFirst() override final { + void SeekToFirst() override final { #ifndef NDEBUG if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return; #endif @@ -348,33 +348,33 @@ class BlockIter : public InternalIteratorBase { UpdateKey(); } - virtual void SeekToLast() override final { + void SeekToLast() override final { SeekToLastImpl(); UpdateKey(); } - virtual void Seek(const Slice& target) override final { + void Seek(const Slice& target) override final { SeekImpl(target); UpdateKey(); } - virtual void SeekForPrev(const Slice& target) override final { + void SeekForPrev(const Slice& target) override final { SeekForPrevImpl(target); UpdateKey(); } - virtual void Next() override final { + void Next() override final { NextImpl(); UpdateKey(); } - virtual bool NextAndGetResult(IterateResult* result) override final { + bool NextAndGetResult(IterateResult* result) override final { // This does not need to call `UpdateKey()` as the parent class only has // access to the `UpdateKey()`-invoking functions. return InternalIteratorBase::NextAndGetResult(result); } - virtual void Prev() override final { + void Prev() override final { PrevImpl(); UpdateKey(); } diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index e66c4939a09..eb2ae4ddba9 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -9,10 +9,9 @@ #include "table/block_based/block_based_table_builder.h" -#include -#include - #include +#include +#include #include #include #include @@ -210,7 +209,7 @@ const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull; // But in the foreseeable future, we will add more and more properties that are // specific to block-based table. class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector - : public IntTblPropCollector { + : public InternalTblPropColl { public: explicit BlockBasedTablePropertiesCollector( BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering, @@ -226,12 +225,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector return Status::OK(); } - virtual void BlockAdd(uint64_t /* block_uncomp_bytes */, - uint64_t /* block_compressed_bytes_fast */, - uint64_t /* block_compressed_bytes_slow */) override { + void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. - return; } Status Finish(UserCollectedProperties* properties) override { @@ -266,6 +264,7 @@ struct BlockBasedTableBuilder::Rep { // BEGIN from MutableCFOptions std::shared_ptr prefix_extractor; // END from MutableCFOptions + const WriteOptions write_options; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; // Size in bytes for the user-defined timestamps. @@ -354,7 +353,7 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; - std::vector> table_properties_collectors; + std::vector> table_properties_collectors; std::unique_ptr pc_rep; BlockCreateContext create_context; @@ -441,6 +440,7 @@ struct BlockBasedTableBuilder::Rep { WritableFileWriter* f) : ioptions(tbo.ioptions), prefix_extractor(tbo.moptions.prefix_extractor), + write_options(tbo.write_options), table_options(table_opt), internal_comparator(tbo.internal_comparator), ts_sz(tbo.internal_comparator.user_comparator()->timestamp_size()), @@ -480,8 +480,10 @@ struct BlockBasedTableBuilder::Rep { compression_ctxs(tbo.compression_opts.parallel_threads), verify_ctxs(tbo.compression_opts.parallel_threads), verify_dict(), - state((tbo.compression_opts.max_dict_bytes > 0) ? State::kBuffered - : State::kUnbuffered), + state((tbo.compression_opts.max_dict_bytes > 0 && + tbo.compression_type != kNoCompression) + ? State::kBuffered + : State::kUnbuffered), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && !table_opt.block_align), reason(tbo.reason), @@ -575,13 +577,16 @@ struct BlockBasedTableBuilder::Rep { persist_user_defined_timestamps)); } - assert(tbo.int_tbl_prop_collector_factories); - for (auto& factory : *tbo.int_tbl_prop_collector_factories) { + assert(tbo.internal_tbl_prop_coll_factories); + for (auto& factory : *tbo.internal_tbl_prop_coll_factories) { assert(factory); - table_properties_collectors.emplace_back( - factory->CreateIntTblPropCollector(tbo.column_family_id, - tbo.level_at_creation)); + std::unique_ptr collector{ + factory->CreateInternalTblPropColl(tbo.column_family_id, + tbo.level_at_creation)}; + if (collector) { + table_properties_collectors.emplace_back(std::move(collector)); + } } table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( @@ -704,7 +709,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep { template void Fill(T&& rep) { slot_.push(std::forward(rep)); - }; + } void Take(BlockRep*& rep) { slot_.pop(rep); } private: @@ -985,7 +990,9 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() { void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { Rep* r = rep_; assert(rep_->state != Rep::State::kClosed); - if (!ok()) return; + if (!ok()) { + return; + } ValueType value_type = ExtractValueType(key); if (IsValueType(value_type)) { #ifndef NDEBUG @@ -1067,14 +1074,26 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->ioptions.logger); } else if (value_type == kTypeRangeDeletion) { - // TODO(yuzhangyu): handle range deletion entries for UDT in memtable only. - r->range_del_block.Add(key, value); + Slice persisted_end = value; + // When timestamps should not be persisted, we physically strip away range + // tombstone end key's user timestamp before passing it along to block + // builder. Physically stripping away start key's user timestamp is + // handled at the block builder level in the same way as the other data + // blocks. + if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) { + persisted_end = StripTimestampFromUserKey(value, r->ts_sz); + } + r->range_del_block.Add(key, persisted_end); // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, r->ioptions.logger); } else { assert(false); + r->SetStatus(Status::InvalidArgument( + "BlockBasedBuilder::Add() received a key with invalid value type " + + std::to_string(static_cast(value_type)))); + return; } r->props.num_entries++; @@ -1097,8 +1116,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { void BlockBasedTableBuilder::Flush() { Rep* r = rep_; assert(rep_->state != Rep::State::kClosed); - if (!ok()) return; - if (r->data_block.empty()) return; + if (!ok()) { + return; + } + if (r->data_block.empty()) { + return; + } if (r->IsParallelCompressionEnabled() && r->state == Rep::State::kUnbuffered) { r->data_block.Finish(); @@ -1310,6 +1333,13 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( // checksum: uint32 Rep* r = rep_; bool is_data_block = block_type == BlockType::kData; + IOOptions io_options; + IOStatus io_s = + WritableFileWriter::PrepareIOOptions(r->write_options, io_options); + if (!io_s.ok()) { + r->SetIOStatus(io_s); + return; + } // Old, misleading name of this function: WriteRawBlock StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); const uint64_t offset = r->get_offset(); @@ -1323,7 +1353,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( } { - IOStatus io_s = r->file->Append(block_contents); + io_s = r->file->Append(io_options, block_contents); if (!io_s.ok()) { r->SetIOStatus(io_s); return; @@ -1350,7 +1380,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum", trailer.data()); { - IOStatus io_s = r->file->Append(Slice(trailer.data(), trailer.size())); + io_s = r->file->Append(io_options, Slice(trailer.data(), trailer.size())); if (!io_s.ok()) { r->SetIOStatus(io_s); return; @@ -1387,7 +1417,8 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( (r->alignment - ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & (r->alignment - 1); - IOStatus io_s = r->file->Pad(pad_bytes); + + io_s = r->file->Pad(io_options, pad_bytes); if (io_s.ok()) { r->set_offset(r->get_offset() + pad_bytes); } else { @@ -1793,7 +1824,14 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, r->SetStatus(s); return; } - IOStatus ios = r->file->Append(footer.GetSlice()); + IOOptions io_options; + IOStatus ios = + WritableFileWriter::PrepareIOOptions(r->write_options, io_options); + if (!ios.ok()) { + r->SetIOStatus(ios); + return; + } + ios = r->file->Append(io_options, footer.GetSlice()); if (ios.ok()) { r->set_offset(r->get_offset() + footer.GetSlice().size()); } else { @@ -2081,9 +2119,9 @@ const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { } } void BlockBasedTableBuilder::SetSeqnoTimeTableProperties( - const std::string& encoded_seqno_to_time_mapping, - uint64_t oldest_ancestor_time) { - rep_->props.seqno_to_time_mapping = encoded_seqno_to_time_mapping; + const SeqnoToTimeMapping& relevant_mapping, uint64_t oldest_ancestor_time) { + assert(rep_->props.seqno_to_time_mapping.empty()); + relevant_mapping.EncodeTo(rep_->props.seqno_to_time_mapping); rep_->props.creation_time = oldest_ancestor_time; } diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 3949474c580..f3360f8bcbc 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -106,9 +106,8 @@ class BlockBasedTableBuilder : public TableBuilder { // Get file checksum function name const char* GetFileChecksumFuncName() const override; - void SetSeqnoTimeTableProperties( - const std::string& encoded_seqno_to_time_mapping, - uint64_t oldest_ancestor_time) override; + void SetSeqnoTimeTableProperties(const SeqnoToTimeMapping& relevant_mapping, + uint64_t oldest_ancestor_time) override; private: bool ok() const { return status().ok(); } diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 25299ecab44..6da594c10d6 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -9,9 +9,8 @@ #include "table/block_based/block_based_table_factory.h" -#include - #include +#include #include #include @@ -309,9 +308,10 @@ static std::unordered_map {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - // TODO "use_delta_encoding" has not been persisted - - // this may have been an omission, but changing this now might be a - // breaker + {"use_delta_encoding", + {offsetof(struct BlockBasedTableOptions, use_delta_encoding), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"filter_policy", OptionTypeInfo::AsCustomSharedPtr( offsetof(struct BlockBasedTableOptions, filter_policy), diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc index 8107e58f24b..3e2f4cc16fb 100644 --- a/table/block_based/block_based_table_iterator.cc +++ b/table/block_based/block_based_table_iterator.cc @@ -16,25 +16,44 @@ void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target, true); } +void BlockBasedTableIterator::SeekSecondPass(const Slice* target) { + AsyncInitDataBlock(/*is_first_pass=*/false); + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || icomp_.Compare(*target, key()) <= 0); + } +} + void BlockBasedTableIterator::SeekImpl(const Slice* target, bool async_prefetch) { - ResetBlockCacheLookupVar(); bool is_first_pass = !async_read_in_progress_; + + if (!is_first_pass) { + SeekSecondPass(target); + return; + } + + ResetBlockCacheLookupVar(); + bool autotune_readaheadsize = is_first_pass && read_options_.auto_readahead_size && read_options_.iterate_upper_bound; if (autotune_readaheadsize && table_->get_rep()->table_options.block_cache.get() && - !read_options_.async_io && direction_ == IterDirection::kForward) { + direction_ == IterDirection::kForward) { readahead_cache_lookup_ = true; } - // Second pass. - if (async_read_in_progress_) { - AsyncInitDataBlock(false); - } - is_out_of_bound_ = false; is_at_first_key_from_index_ = false; seek_stat_state_ = kNone; @@ -57,7 +76,8 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, bool need_seek_index = true; // In case of readahead_cache_lookup_, index_iter_ could change to find the - // readahead size in BlockCacheLookupForReadAheadSize so it needs to reseek. + // readahead size in BlockCacheLookupForReadAheadSize so it needs to + // reseek. if (IsIndexAtCurr() && block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. @@ -94,24 +114,8 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, } } - if (autotune_readaheadsize) { - FindReadAheadSizeUpperBound(); - if (target) { - index_iter_->Seek(*target); - } else { - index_iter_->SeekToFirst(); - } - - // Check for IO error. - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - } - // After reseek, index_iter_ point to the right key i.e. target in // case of readahead_cache_lookup_. So index_iter_ can be used directly. - IndexValue v = index_iter_->value(); const bool same_block = block_iter_points_to_real_block_ && v.handle.offset() == prev_block_offset_; @@ -130,14 +134,12 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, // Need to use the data block. if (!same_block) { if (read_options_.async_io && async_prefetch) { - if (is_first_pass) { - AsyncInitDataBlock(is_first_pass); - } + AsyncInitDataBlock(/*is_first_pass=*/true); if (async_read_in_progress_) { // Status::TryAgain indicates asynchronous request for retrieval of // data blocks has been submitted. So it should return at this point - // and Seek should be called again to retrieve the requested block and - // execute the remaining code. + // and Seek should be called again to retrieve the requested block + // and execute the remaining code. return; } } else { @@ -286,11 +288,29 @@ bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { } void BlockBasedTableIterator::Prev() { - // Return Error. - if (readahead_cache_lookup_) { - block_iter_.Invalidate(Status::NotSupported( - "auto tuning of readahead_size is not supported with Prev operation.")); - return; + if (readahead_cache_lookup_ && !IsIndexAtCurr()) { + // In case of readahead_cache_lookup_, index_iter_ has moved forward. So we + // need to reseek the index_iter_ to point to current block by using + // block_iter_'s key. + if (Valid()) { + ResetBlockCacheLookupVar(); + direction_ = IterDirection::kBackward; + Slice last_key = key(); + + index_iter_->Seek(last_key); + is_index_at_curr_block_ = true; + + // Check for IO error. + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + if (!Valid()) { + ResetDataIter(); + return; + } } ResetBlockCacheLookupVar(); @@ -346,7 +366,7 @@ void BlockBasedTableIterator::InitDataBlock() { } else { auto* rep = table_->get_rep(); - std::function readaheadsize_cb = + std::function readaheadsize_cb = nullptr; if (readahead_cache_lookup_) { readaheadsize_cb = std::bind( @@ -364,7 +384,8 @@ void BlockBasedTableIterator::InitDataBlock() { block_prefetcher_.PrefetchIfNeeded( rep, data_block_handle, read_options_.readahead_size, is_for_compaction, - /*no_sequential_checking=*/false, read_options_, readaheadsize_cb); + /*no_sequential_checking=*/false, read_options_, readaheadsize_cb, + read_options_.async_io); Status s; table_->NewDataBlockIterator( @@ -389,10 +410,11 @@ void BlockBasedTableIterator::InitDataBlock() { } void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { - BlockHandle data_block_handle = index_iter_->value().handle; + BlockHandle data_block_handle; bool is_for_compaction = lookup_context_.caller == TableReaderCaller::kCompaction; if (is_first_pass) { + data_block_handle = index_iter_->value().handle; if (!block_iter_points_to_real_block_ || data_block_handle.offset() != prev_block_offset_ || // if previous attempt of reading the block missed cache, try again @@ -402,7 +424,7 @@ void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { } auto* rep = table_->get_rep(); - std::function readaheadsize_cb = + std::function readaheadsize_cb = nullptr; if (readahead_cache_lookup_) { readaheadsize_cb = std::bind( @@ -423,7 +445,7 @@ void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { block_prefetcher_.PrefetchIfNeeded( rep, data_block_handle, read_options_.readahead_size, is_for_compaction, /*no_sequential_checking=*/read_options_.async_io, - read_options_, readaheadsize_cb); + read_options_, readaheadsize_cb, read_options_.async_io); Status s; table_->NewDataBlockIterator( @@ -441,13 +463,30 @@ void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { } else { // Second pass will call the Poll to get the data block which has been // requested asynchronously. + bool is_in_cache = false; + + if (DoesContainBlockHandles()) { + data_block_handle = block_handles_.front().handle_; + is_in_cache = block_handles_.front().is_cache_hit_; + } else { + data_block_handle = index_iter_->value().handle; + } + Status s; - table_->NewDataBlockIterator( - read_options_, data_block_handle, &block_iter_, BlockType::kData, - /*get_context=*/nullptr, &lookup_context_, - block_prefetcher_.prefetch_buffer(), - /*for_compaction=*/is_for_compaction, /*async_read=*/false, s, - /*use_block_cache_for_lookup=*/false); + // Initialize Data Block From CacheableEntry. + if (is_in_cache) { + block_iter_.Invalidate(Status::OK()); + table_->NewDataBlockIterator( + read_options_, (block_handles_.front().cachable_entry_).As(), + &block_iter_, s); + } else { + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s, + /*use_block_cache_for_lookup=*/false); + } } block_iter_points_to_real_block_ = true; CheckDataBlockWithinUpperBound(); @@ -638,99 +677,143 @@ void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { } } -void BlockBasedTableIterator::FindReadAheadSizeUpperBound() { - size_t total_bytes_till_upper_bound = 0; +void BlockBasedTableIterator::InitializeStartAndEndOffsets( + bool read_curr_block, bool& found_first_miss_block, + uint64_t& start_updated_offset, uint64_t& end_updated_offset, + size_t& prev_handles_size) { + prev_handles_size = block_handles_.size(); size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); - uint64_t start_offset = index_iter_->value().handle.offset(); - do { - BlockHandle block_handle = index_iter_->value().handle; - total_bytes_till_upper_bound += block_handle.size(); - total_bytes_till_upper_bound += footer; - - // Can't figure out for current block if current block - // is out of bound. But for next block we can find that. - // If curr block's index key >= iterate_upper_bound, it - // means all the keys in next block or above are out of - // bound. - if (IsNextBlockOutOfBound()) { - break; + // It initialize start and end offset to begin which is covered by following + // scenarios + if (read_curr_block) { + if (!DoesContainBlockHandles()) { + // Scenario 1 : read_curr_block (callback made on miss block which caller + // was reading) and it has no existing handles in queue. i.e. + // index_iter_ is pointing to block that is being read by + // caller. + // + // Add current block here as it doesn't need any lookup. + BlockHandleInfo block_handle_info; + block_handle_info.handle_ = index_iter_->value().handle; + block_handle_info.SetFirstInternalKey( + index_iter_->value().first_internal_key); + + end_updated_offset = block_handle_info.handle_.offset() + footer + + block_handle_info.handle_.size(); + block_handles_.emplace_back(std::move(block_handle_info)); + + index_iter_->Next(); + is_index_at_curr_block_ = false; + found_first_miss_block = true; + } else { + // Scenario 2 : read_curr_block (callback made on miss block which caller + // was reading) but the queue already has some handles. + // + // It can be due to reading error in second buffer in FilePrefetchBuffer. + // BlockHandles already added to the queue but there was error in fetching + // those data blocks. So in this call they need to be read again. + found_first_miss_block = true; + // Initialize prev_handles_size to 0 as all those handles need to be read + // again. + prev_handles_size = 0; + start_updated_offset = block_handles_.front().handle_.offset(); + end_updated_offset = block_handles_.back().handle_.offset() + footer + + block_handles_.back().handle_.size(); } - - // Since next block is not out of bound, iterate to that - // index block and add it's Data block size to - // readahead_size. - index_iter_->Next(); - - if (!index_iter_->Valid()) { - break; + } else { + // Scenario 3 : read_curr_block is false (callback made to do additional + // prefetching in buffers) and the queue already has some + // handles from first buffer. + if (DoesContainBlockHandles()) { + start_updated_offset = block_handles_.back().handle_.offset() + footer + + block_handles_.back().handle_.size(); + end_updated_offset = start_updated_offset; + } else { + // Scenario 4 : read_curr_block is false (callback made to do additional + // prefetching in buffers) but the queue has no handle + // from first buffer. + // + // It can be when Reseek is from block cache (which doesn't clear the + // buffers in FilePrefetchBuffer but clears block handles from queue) and + // reseek also lies within the buffer. So Next will get data from + // exisiting buffers untill this callback is made to prefetch additional + // data. All handles need to be added to the queue starting from + // index_iter_. + assert(index_iter_->Valid()); + start_updated_offset = index_iter_->value().handle.offset(); + end_updated_offset = start_updated_offset; } - - } while (true); - - block_prefetcher_.SetUpperBoundOffset(start_offset + - total_bytes_till_upper_bound); + } } +// BlockCacheLookupForReadAheadSize API lookups in the block cache and tries to +// reduce the start and end offset passed. +// +// Implementation - +// This function looks into the block cache for the blocks between start_offset +// and end_offset and add all the handles in the queue. +// It then iterates from the end to find first miss block and update the end +// offset to that block. +// It also iterates from the start and find first miss block and update the +// start offset to that block. +// +// Arguments - +// start_offset : Offset from which the caller wants to read. +// end_offset : End offset till which the caller wants to read. +// read_curr_block : True if this call was due to miss in the cache and +// caller wants to read that block. +// False if current call is to prefetch additional data in +// extra buffers. void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( - uint64_t offset, size_t readahead_size, size_t& updated_readahead_size) { - updated_readahead_size = readahead_size; + bool read_curr_block, uint64_t& start_offset, uint64_t& end_offset) { + uint64_t start_updated_offset = start_offset; - // readahead_cache_lookup_ can be set false after Seek, if after Seek or Next + // readahead_cache_lookup_ can be set false, if after Seek and Next // there is SeekForPrev or any other backward operation. if (!readahead_cache_lookup_) { return; } - assert(!DoesContainBlockHandles()); - assert(index_iter_->value().handle.offset() == offset); - - // Error. current offset should be equal to what's requested for prefetching. - if (index_iter_->value().handle.offset() != offset) { - return; - } - - if (IsNextBlockOutOfBound()) { - updated_readahead_size = 0; + size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); + if (read_curr_block && !DoesContainBlockHandles() && + IsNextBlockOutOfBound()) { + end_offset = index_iter_->value().handle.offset() + footer + + index_iter_->value().handle.size(); return; } - size_t current_readahead_size = 0; - size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); - - // Add the current block to block_handles_. - { - BlockHandleInfo block_handle_info; - block_handle_info.handle_ = index_iter_->value().handle; - block_handle_info.SetFirstInternalKey( - index_iter_->value().first_internal_key); - block_handles_.emplace_back(std::move(block_handle_info)); - } + uint64_t end_updated_offset = start_updated_offset; + bool found_first_miss_block = false; + size_t prev_handles_size; - // Current block is included in length. Readahead should start from next - // block. - index_iter_->Next(); - is_index_at_curr_block_ = false; + // Initialize start and end offsets based on exisiting handles in the queue + // and read_curr_block argument passed. + InitializeStartAndEndOffsets(read_curr_block, found_first_miss_block, + start_updated_offset, end_updated_offset, + prev_handles_size); - while (index_iter_->Valid()) { + while (index_iter_->Valid() && !is_index_out_of_bound_) { BlockHandle block_handle = index_iter_->value().handle; - // Adding this data block exceeds passed down readahead_size. So this data + // Adding this data block exceeds end offset. So this data // block won't be added. - if (current_readahead_size + block_handle.size() + footer > - readahead_size) { + // There can be a case where passed end offset is smaller than + // block_handle.size() + footer because of readahead_size truncated to + // upper_bound. So we prefer to read the block rather than skip it to avoid + // sync read calls in case of async_io. + if (start_updated_offset != end_updated_offset && + (end_updated_offset + block_handle.size() + footer > end_offset)) { break; } - current_readahead_size += block_handle.size(); - current_readahead_size += footer; - // For current data block, do the lookup in the cache. Lookup should pin the - // data block and add the placeholder for cache. + // data block in cache. BlockHandleInfo block_handle_info; block_handle_info.handle_ = index_iter_->value().handle; block_handle_info.SetFirstInternalKey( index_iter_->value().first_internal_key); + end_updated_offset += footer + block_handle_info.handle_.size(); Status s = table_->LookupAndPinBlocksInCache( read_options_, block_handle, @@ -743,6 +826,12 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( (block_handle_info.cachable_entry_.GetValue() || block_handle_info.cachable_entry_.GetCacheHandle()); + // If this is the first miss block, update start offset to this block. + if (!found_first_miss_block && !block_handle_info.is_cache_hit_) { + found_first_miss_block = true; + start_updated_offset = block_handle_info.handle_.offset(); + } + // Add the handle to the queue. block_handles_.emplace_back(std::move(block_handle_info)); @@ -756,16 +845,29 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( break; } index_iter_->Next(); - }; + is_index_at_curr_block_ = false; + } + + if (found_first_miss_block) { + // Iterate cache hit block handles from the end till a Miss is there, to + // truncate and update the end offset till that Miss. + auto it = block_handles_.rbegin(); + auto it_end = + block_handles_.rbegin() + (block_handles_.size() - prev_handles_size); - // Iterate cache hit block handles from the end till a Miss is there, to - // update the readahead_size. - for (auto it = block_handles_.rbegin(); - it != block_handles_.rend() && (*it).is_cache_hit_ == true; ++it) { - current_readahead_size -= (*it).handle_.size(); - current_readahead_size -= footer; + while (it != it_end && (*it).is_cache_hit_ && + start_updated_offset != (*it).handle_.offset()) { + it++; + } + end_updated_offset = (*it).handle_.offset() + footer + (*it).handle_.size(); + } else { + // Nothing to read. Can be because of IOError in index_iter_->Next() or + // reached upper_bound. + end_updated_offset = start_updated_offset; } - updated_readahead_size = current_readahead_size; + + end_offset = end_updated_offset; + start_offset = start_updated_offset; ResetPreviousBlockOffset(); } diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index 7ed7e3375a1..d9e29a75f3b 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -9,6 +9,7 @@ #pragma once #include +#include "db/seqno_to_time_mapping.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_based_table_reader_impl.h" #include "table/block_based/block_prefetcher.h" @@ -92,6 +93,28 @@ class BlockBasedTableIterator : public InternalIteratorBase { return const_cast(this) ->MaterializeCurrentBlock(); } + + uint64_t write_unix_time() const override { + assert(Valid()); + ParsedInternalKey pikey; + SequenceNumber seqno; + const SeqnoToTimeMapping& seqno_to_time_mapping = + table_->GetSeqnoToTimeMapping(); + Status s = ParseInternalKey(key(), &pikey, /*log_err_key=*/false); + if (!s.ok()) { + return std::numeric_limits::max(); + } else if (kUnknownSeqnoBeforeAll == pikey.sequence) { + return kUnknownTimeBeforeAll; + } else if (seqno_to_time_mapping.Empty()) { + return std::numeric_limits::max(); + } else if (kTypeValuePreferredSeqno == pikey.type) { + seqno = ParsePackedValueForSeqno(value()); + } else { + seqno = pikey.sequence; + } + return seqno_to_time_mapping.GetProximalTimeBeforeSeqno(seqno); + } + Slice value() const override { // PrepareValue() must have been called. assert(!is_at_first_key_from_index_); @@ -122,7 +145,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { } else if (block_iter_points_to_real_block_) { return block_iter_.status(); } else if (async_read_in_progress_) { - return Status::TryAgain(); + return Status::TryAgain("Async read in progress"); } else { return Status::OK(); } @@ -199,6 +222,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { } } + FilePrefetchBuffer* prefetch_buffer() { + return block_prefetcher_.prefetch_buffer(); + } + std::unique_ptr> index_iter_; private: @@ -325,6 +352,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { // is used to disable the lookup. IterDirection direction_ = IterDirection::kForward; + void SeekSecondPass(const Slice* target); + // If `target` is null, seek to first. void SeekImpl(const Slice* target, bool async_prefetch); @@ -365,12 +394,12 @@ class BlockBasedTableIterator : public InternalIteratorBase { } // *** BEGIN APIs relevant to auto tuning of readahead_size *** - void FindReadAheadSizeUpperBound(); - // This API is called to lookup the data blocks ahead in the cache to estimate - // the current readahead_size. - void BlockCacheLookupForReadAheadSize(uint64_t offset, size_t readahead_size, - size_t& updated_readahead_size); + // This API is called to lookup the data blocks ahead in the cache to tune + // the start and end offsets passed. + void BlockCacheLookupForReadAheadSize(bool read_curr_block, + uint64_t& start_offset, + uint64_t& end_offset); void ResetBlockCacheLookupVar() { is_index_out_of_bound_ = false; @@ -399,6 +428,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool DoesContainBlockHandles() { return !block_handles_.empty(); } + void InitializeStartAndEndOffsets(bool read_curr_block, + bool& found_first_miss_block, + uint64_t& start_updated_offset, + uint64_t& end_updated_offset, + size_t& prev_handles_size); // *** END APIs relevant to auto tuning of readahead_size *** }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 678cdf469e6..a12aa0d7fd1 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -217,6 +217,7 @@ void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, Statistics* const statistics = rep_->ioptions.stats; PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_ADD(block_cache_read_byte, usage); PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, static_cast(rep_->level)); @@ -232,6 +233,7 @@ void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, case BlockType::kFilter: case BlockType::kFilterPartitionIndex: PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + PERF_COUNTER_ADD(block_cache_filter_read_byte, usage); if (get_context) { ++get_context->get_context_stats_.num_cache_filter_hit; @@ -242,6 +244,7 @@ void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, case BlockType::kCompressionDictionary: // TODO: introduce perf counter for compression dictionary hit count + PERF_COUNTER_ADD(block_cache_compression_dict_read_byte, usage); if (get_context) { ++get_context->get_context_stats_.num_cache_compression_dict_hit; } else { @@ -251,6 +254,7 @@ void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, case BlockType::kIndex: PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + PERF_COUNTER_ADD(block_cache_index_read_byte, usage); if (get_context) { ++get_context->get_context_stats_.num_cache_index_hit; @@ -601,7 +605,7 @@ Status BlockBasedTable::Open( const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool preload_all = !table_options.cache_index_and_filter_blocks; - if (!ioptions.allow_mmap_reads) { + if (!ioptions.allow_mmap_reads && !env_options.use_mmap_reads) { s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, &prefetch_buffer, ioptions.stats, tail_size, @@ -613,8 +617,7 @@ Status BlockBasedTable::Open( } else { // Should not prefetch for mmap mode. prefetch_buffer.reset(new FilePrefetchBuffer( - 0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */, - true /* track_min_offset */)); + ReadaheadParams(), false /* enable */, true /* track_min_offset */)); } // Read in the following order: @@ -632,6 +635,19 @@ Status BlockBasedTable::Open( prefetch_buffer.get(), file_size, &footer, kBlockBasedTableMagicNumber); } + // If the footer is corrupted and the FS supports checksum verification and + // correction, try reading the footer again + if (s.IsCorruption()) { + RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT); + if (CheckFSFeatureSupport(ioptions.fs.get(), + FSSupportedOps::kVerifyAndReconstructRead)) { + IOOptions retry_opts = opts; + retry_opts.verify_and_reconstruct_read = true; + s = ReadFooterFromFile(retry_opts, file.get(), *ioptions.fs, + prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + } + } if (!s.ok()) { return s; } @@ -774,7 +790,6 @@ Status BlockBasedTable::Open( PersistentCacheOptions(rep->table_options.persistent_cache, rep->base_cache_key, rep->ioptions.stats); - // TODO(yuzhangyu): handle range deletion entries for UDT in memtable only. s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(), metaindex_iter.get(), internal_comparator, &lookup_context); @@ -876,18 +891,15 @@ Status BlockBasedTable::PrefetchTail( if (s.ok() && !file->use_direct_io() && !force_direct_prefetch) { if (!file->Prefetch(opts, prefetch_off, prefetch_len).IsNotSupported()) { prefetch_buffer->reset(new FilePrefetchBuffer( - 0 /* readahead_size */, 0 /* max_readahead_size */, - false /* enable */, true /* track_min_offset */)); + ReadaheadParams(), false /* enable */, true /* track_min_offset */)); return Status::OK(); } } // Use `FilePrefetchBuffer` prefetch_buffer->reset(new FilePrefetchBuffer( - 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, - true /* track_min_offset */, false /* implicit_auto_readahead */, - 0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */, - 0 /* upper_bound_offset */, nullptr /* fs */, nullptr /* clock */, stats, + ReadaheadParams(), true /* enable */, true /* track_min_offset */, + nullptr /* fs */, nullptr /* clock */, stats, /* readahead_cb */ nullptr, FilePrefetchBufferUsage::kTableOpenPrefetchTail)); @@ -926,6 +938,17 @@ Status BlockBasedTable::ReadPropertiesBlock( } else { assert(table_properties != nullptr); rep_->table_properties = std::move(table_properties); + + if (s.ok()) { + s = rep_->seqno_to_time_mapping.DecodeFrom( + rep_->table_properties->seqno_to_time_mapping); + } + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Problem reading or processing seqno-to-time mapping: %s", + s.ToString().c_str()); + } rep_->blocks_maybe_compressed = rep_->table_properties->compression_name != CompressionTypeToString(kNoCompression); @@ -1010,9 +1033,16 @@ Status BlockBasedTable::ReadRangeDelBlock( s.ToString().c_str()); IGNORE_STATUS_IF_ERROR(s); } else { + std::vector snapshots; + // When user defined timestamps are not persisted, the range tombstone end + // key read from the data block doesn't include user timestamp. + // The range tombstone start key should already include user timestamp as + // it's handled at block parsing level in the same way as the other data + // blocks. rep_->fragmented_range_dels = - std::make_shared(std::move(iter), - internal_comparator); + std::make_shared( + std::move(iter), internal_comparator, false /*for_compaction=*/, + snapshots, rep_->user_defined_timestamps_persisted); } } return s; @@ -1231,6 +1261,10 @@ std::shared_ptr BlockBasedTable::GetTableProperties() return rep_->table_properties; } +const SeqnoToTimeMapping& BlockBasedTable::GetSeqnoToTimeMapping() const { + return rep_->seqno_to_time_mapping; +} + size_t BlockBasedTable::ApproximateMemoryUsage() const { size_t usage = 0; if (rep_) { @@ -2305,6 +2339,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, biter.key(), &parsed_key, false /* log_err_key */); // TODO if (!pik_status.ok()) { s = pik_status; + break; } if (GetThreadLogging()) { @@ -2316,9 +2351,15 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, int(parsed_key.type)); } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { + Status read_status; + bool ret = get_context->SaveValue( + parsed_key, biter.value(), &matched, &read_status, + biter.IsValuePinned() ? &biter : nullptr); + if (!read_status.ok()) { + s = read_status; + break; + } + if (!ret) { if (get_context->State() == GetContext::GetState::kFound) { does_referenced_key_exist = true; referenced_data_size = biter.key().size() + biter.value().size(); @@ -2327,7 +2368,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, break; } } - s = biter.status(); + if (s.ok()) { + s = biter.status(); + } if (!s.ok()) { break; } @@ -2508,10 +2551,11 @@ Status BlockBasedTable::VerifyChecksumInBlocks( : rep_->table_options.max_auto_readahead_size; // FilePrefetchBuffer doesn't work in mmap mode and readahead is not // needed there. + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = readahead_size; + readahead_params.max_readahead_size = readahead_size; FilePrefetchBuffer prefetch_buffer( - readahead_size /* readahead_size */, - readahead_size /* max_readahead_size */, - !rep_->ioptions.allow_mmap_reads /* enable */); + readahead_params, !rep_->ioptions.allow_mmap_reads /* enable */); for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); @@ -2660,6 +2704,17 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, return TEST_BlockInCache(iiter->value().handle); } +void BlockBasedTable::TEST_GetDataBlockHandle(const ReadOptions& options, + const Slice& key, + BlockHandle& handle) { + std::unique_ptr> iiter(NewIndexIterator( + options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + iiter->Seek(key); + assert(iiter->Valid()); + handle = iiter->value().handle; +} + // REQUIRES: The following fields of rep_ should have already been populated: // 1. file // 2. index_handle, @@ -2920,7 +2975,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions ro; Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); @@ -3025,7 +3080,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { out_stream << "Index Details:\n" "--------------------------------------\n"; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr> blockhandles_iter( NewIndexIterator(read_options, /*need_upper_bound_check=*/false, @@ -3062,7 +3117,7 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { << " size " << blockhandles_iter->value().handle.size() << "\n"; std::string str_key = user_key.ToString(); - std::string res_key(""); + std::string res_key; char cspace = ' '; for (size_t i = 0; i < str_key.size(); i++) { res_key.append(&str_key[i], 1); @@ -3076,7 +3131,7 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { } Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr> blockhandles_iter( NewIndexIterator(read_options, /*need_upper_bound_check=*/false, @@ -3163,7 +3218,7 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, std::string str_key = ikey.user_key().ToString(); std::string str_value = value.ToString(); - std::string res_key(""), res_value(""); + std::string res_key, res_value; char cspace = ' '; for (size_t i = 0; i < str_key.size(); i++) { if (str_key[i] == '\0') { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 22361b505d4..a98d7c78bee 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -16,6 +16,7 @@ #include "cache/cache_key.h" #include "cache/cache_reservation_manager.h" #include "db/range_tombstone_fragmenter.h" +#include "db/seqno_to_time_mapping.h" #include "file/filename.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table_properties.h" @@ -188,12 +189,17 @@ class BlockBasedTable : public TableReader { // REQUIRES: key is in this table && block cache enabled bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + void TEST_GetDataBlockHandle(const ReadOptions& options, const Slice& key, + BlockHandle& handle); + // Set up the table for Compaction. Might change some parameters with // posix_fadvise void SetupForCompaction() override; std::shared_ptr GetTableProperties() const override; + const SeqnoToTimeMapping& GetSeqnoToTimeMapping() const; + size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form @@ -604,6 +610,7 @@ struct BlockBasedTable::Rep { BlockHandle compression_dict_handle; std::shared_ptr table_properties; + SeqnoToTimeMapping seqno_to_time_mapping; BlockHandle index_handle; BlockBasedTableOptions::IndexType index_type; bool whole_key_filtering; @@ -693,33 +700,23 @@ struct BlockBasedTable::Rep { return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; } void CreateFilePrefetchBuffer( - size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb, bool implicit_auto_readahead, - uint64_t num_file_reads, uint64_t num_file_reads_for_auto_readahead, - uint64_t upper_bound_offset, - const std::function& readaheadsize_cb, + const ReadaheadParams& readahead_params, + std::unique_ptr* fpb, + const std::function& readaheadsize_cb, FilePrefetchBufferUsage usage) const { fpb->reset(new FilePrefetchBuffer( - readahead_size, max_readahead_size, - !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */, - implicit_auto_readahead, num_file_reads, - num_file_reads_for_auto_readahead, upper_bound_offset, - ioptions.fs.get(), ioptions.clock, ioptions.stats, readaheadsize_cb, - usage)); + readahead_params, !ioptions.allow_mmap_reads /* enable */, + false /* track_min_offset */, ioptions.fs.get(), ioptions.clock, + ioptions.stats, readaheadsize_cb, usage)); } void CreateFilePrefetchBufferIfNotExists( - size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb, bool implicit_auto_readahead, - uint64_t num_file_reads, uint64_t num_file_reads_for_auto_readahead, - uint64_t upper_bound_offset, - const std::function& readaheadsize_cb, + const ReadaheadParams& readahead_params, + std::unique_ptr* fpb, + const std::function& readaheadsize_cb, FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) const { if (!(*fpb)) { - CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, - implicit_auto_readahead, num_file_reads, - num_file_reads_for_auto_readahead, - upper_bound_offset, readaheadsize_cb, usage); + CreateFilePrefetchBuffer(readahead_params, fpb, readaheadsize_cb, usage); } } diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index e7621909cc7..30d9884ec65 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -192,10 +192,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) BlockContents serialized_block; if (s.ok()) { - if (use_fs_scratch) { - serialized_block = - BlockContents(Slice(req.result.data() + req_offset, handle.size())); - } else if (!use_shared_buffer) { + if (!use_fs_scratch && !use_shared_buffer) { // We allocated a buffer for this block. Give ownership of it to // BlockContents so it can free the memory assert(req.result.data() == req.scratch); @@ -206,7 +203,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) } else { // We used the scratch buffer or direct io buffer // which are shared by the blocks. - // serialized_block does not have the ownership. + // In case of use_fs_scratch, underlying file system provided buffer is + // used. serialized_block does not have the ownership. serialized_block = BlockContents(Slice(req.result.data() + req_offset, handle.size())); } @@ -216,15 +214,42 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) if (options.verify_checksums) { PERF_TIMER_GUARD(block_checksum_time); - const char* data = req.result.data(); + const char* data = serialized_block.data.data(); // Since the scratch might be shared, the offset of the data block in // the buffer might not be 0. req.result.data() only point to the // begin address of each read request, we need to add the offset // in each read request. Checksum is stored in the block trailer, // beyond the payload size. - s = VerifyBlockChecksum(footer, data + req_offset, handle.size(), + s = VerifyBlockChecksum(footer, data, handle.size(), rep_->file->file_name(), handle.offset()); + RecordTick(ioptions.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); + if (!s.ok() && + CheckFSFeatureSupport(ioptions.fs.get(), + FSSupportedOps::kVerifyAndReconstructRead)) { + assert(s.IsCorruption()); + assert(!ioptions.allow_mmap_reads); + RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); + + // Repeat the read for this particular block using the regular + // synchronous Read API. We can use the same chunk of memory + // pointed to by data, since the size is identical and we know + // its not a memory mapped file + Slice result; + IOOptions opts; + IOStatus io_s = file->PrepareIOOptions(options, opts); + opts.verify_and_reconstruct_read = true; + io_s = file->Read(opts, handle.offset(), BlockSizeWithTrailer(handle), + &result, const_cast(data), nullptr); + if (io_s.ok()) { + assert(result.data() == data); + assert(result.size() == BlockSizeWithTrailer(handle)); + s = VerifyBlockChecksum(footer, data, handle.size(), + rep_->file->file_name(), handle.offset()); + } else { + s = io_s; + } + } } } else if (!use_shared_buffer) { // Free the allocated scratch buffer. @@ -704,15 +729,23 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) } // Call the *saver function on each entry/block until it returns false - for (; biter->Valid(); biter->Next()) { + for (; biter->status().ok() && biter->Valid(); biter->Next()) { ParsedInternalKey parsed_key; Status pik_status = ParseInternalKey( biter->key(), &parsed_key, false /* log_err_key */); // TODO if (!pik_status.ok()) { s = pik_status; + break; } - if (!get_context->SaveValue(parsed_key, biter->value(), &matched, - value_pinner)) { + Status read_status; + bool ret = get_context->SaveValue( + parsed_key, biter->value(), &matched, &read_status, + value_pinner ? value_pinner : nullptr); + if (!read_status.ok()) { + s = read_status; + break; + } + if (!ret) { if (get_context->State() == GetContext::GetState::kFound) { does_referenced_key_exist = true; referenced_data_size = @@ -721,7 +754,6 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) done = true; break; } - s = biter->status(); } // Write the block cache access. // XXX: There appear to be 'break' statements above that bypass this @@ -744,8 +776,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) *lookup_data_block_context, lookup_data_block_context->block_key, referenced_key, does_referenced_key_exist, referenced_data_size); } - s = biter->status(); - if (done) { + if (s.ok()) { + s = biter->status(); + } + if (done || !s.ok()) { // Avoid the extra Next which is expensive in two-level indexes break; } diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index 254546893f3..d90ee157821 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -19,6 +19,7 @@ #include "rocksdb/compression_type.h" #include "rocksdb/db.h" #include "rocksdb/file_system.h" +#include "rocksdb/options.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/partitioned_index_iterator.h" @@ -132,12 +133,14 @@ class BlockBasedTableReaderBaseTest : public testing::Test { // as each block's size. compression_opts.max_dict_bytes = compression_dict_bytes; compression_opts.max_dict_buffer_bytes = compression_dict_bytes; - IntTblPropCollectorFactories factories; + InternalTblPropCollFactories factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr table_builder( options_.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, compression_opts, - 0 /* column_family_id */, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + comparator, &factories, compression_type, + compression_opts, 0 /* column_family_id */, kDefaultColumnFamilyName, -1 /* level */), writer.get())); @@ -159,7 +162,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test { bool user_defined_timestamps_persisted = true) { const MutableCFOptions moptions(options_); TableReaderOptions table_reader_options = TableReaderOptions( - ioptions, moptions.prefix_extractor, EnvOptions(), comparator, + ioptions, moptions.prefix_extractor, foptions, comparator, 0 /* block_protection_bytes_per_key */, false /* _skip_filters */, false /* _immortal */, false /* _force_direct_prefetch */, -1 /* _level */, nullptr /* _block_cache_tracer */, @@ -181,7 +184,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test { &general_table, prefetch_index_and_filter_in_cache); if (s.ok()) { - table->reset(reinterpret_cast(general_table.release())); + table->reset(static_cast(general_table.release())); } if (status) { diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc index 877df81c129..e4950e4356b 100644 --- a/table/block_based/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -33,9 +33,8 @@ #include "table/block_based/block_builder.h" -#include - #include +#include #include "db/dbformat.h" #include "rocksdb/comparator.h" diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index 54848b785ba..ec8a91868ce 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -16,7 +16,13 @@ void BlockPrefetcher::PrefetchIfNeeded( const BlockBasedTable::Rep* rep, const BlockHandle& handle, const size_t readahead_size, bool is_for_compaction, const bool no_sequential_checking, const ReadOptions& read_options, - const std::function& readaheadsize_cb) { + const std::function& readaheadsize_cb, + bool is_async_io_prefetch) { + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = readahead_size; + readahead_params.max_readahead_size = readahead_size; + readahead_params.num_buffers = is_async_io_prefetch ? 2 : 1; + const size_t len = BlockBasedTable::BlockSizeWithTrailer(handle); const size_t offset = handle.offset(); if (is_for_compaction) { @@ -44,21 +50,18 @@ void BlockPrefetcher::PrefetchIfNeeded( // // num_file_reads is used by FilePrefetchBuffer only when // implicit_auto_readahead is set. - rep->CreateFilePrefetchBufferIfNotExists( - compaction_readahead_size_, compaction_readahead_size_, - &prefetch_buffer_, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, - /*upper_bound_offset=*/0, /*readaheadsize_cb=*/nullptr); + readahead_params.initial_readahead_size = compaction_readahead_size_; + readahead_params.max_readahead_size = compaction_readahead_size_; + rep->CreateFilePrefetchBufferIfNotExists(readahead_params, + &prefetch_buffer_, + /*readaheadsize_cb=*/nullptr); return; } // Explicit user requested readahead. if (readahead_size > 0) { rep->CreateFilePrefetchBufferIfNotExists( - readahead_size, readahead_size, &prefetch_buffer_, - /*implicit_auto_readahead=*/false, /*num_file_reads=*/0, - /*num_file_reads_for_auto_readahead=*/0, upper_bound_offset_, - readaheadsize_cb, + readahead_params, &prefetch_buffer_, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -76,15 +79,17 @@ void BlockPrefetcher::PrefetchIfNeeded( initial_auto_readahead_size_ = max_auto_readahead_size; } + readahead_params.initial_readahead_size = initial_auto_readahead_size_; + readahead_params.max_readahead_size = max_auto_readahead_size; + readahead_params.implicit_auto_readahead = true; + readahead_params.num_file_reads_for_auto_readahead = + rep->table_options.num_file_reads_for_auto_readahead; + // In case of no_sequential_checking, it will skip the num_file_reads_ and // will always creates the FilePrefetchBuffer. if (no_sequential_checking) { rep->CreateFilePrefetchBufferIfNotExists( - initial_auto_readahead_size_, max_auto_readahead_size, - &prefetch_buffer_, /*implicit_auto_readahead=*/true, - /*num_file_reads=*/0, - rep->table_options.num_file_reads_for_auto_readahead, - upper_bound_offset_, readaheadsize_cb, + readahead_params, &prefetch_buffer_, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -111,12 +116,10 @@ void BlockPrefetcher::PrefetchIfNeeded( return; } + readahead_params.num_file_reads = num_file_reads_; if (rep->file->use_direct_io()) { rep->CreateFilePrefetchBufferIfNotExists( - initial_auto_readahead_size_, max_auto_readahead_size, - &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, - rep->table_options.num_file_reads_for_auto_readahead, - upper_bound_offset_, readaheadsize_cb, + readahead_params, &prefetch_buffer_, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -136,10 +139,7 @@ void BlockPrefetcher::PrefetchIfNeeded( BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_); if (s.IsNotSupported()) { rep->CreateFilePrefetchBufferIfNotExists( - initial_auto_readahead_size_, max_auto_readahead_size, - &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, - rep->table_options.num_file_reads_for_auto_readahead, - upper_bound_offset_, readaheadsize_cb, + readahead_params, &prefetch_buffer_, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index 7e075c08e2d..3432e0e8ad6 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -22,7 +22,8 @@ class BlockPrefetcher { const BlockBasedTable::Rep* rep, const BlockHandle& handle, size_t readahead_size, bool is_for_compaction, const bool no_sequential_checking, const ReadOptions& read_options, - const std::function& readaheadsize_cb); + const std::function& readaheadsize_cb, + bool is_async_io_prefetch); FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } void UpdateReadPattern(const uint64_t& offset, const size_t& len) { @@ -53,15 +54,6 @@ class BlockPrefetcher { &initial_auto_readahead_size_); } - void SetUpperBoundOffset(uint64_t upper_bound_offset) { - upper_bound_offset_ = upper_bound_offset; - if (prefetch_buffer() != nullptr) { - // Upper bound can be changed on reseek. So update that in - // FilePrefetchBuffer. - prefetch_buffer()->ResetUpperBoundOffset(upper_bound_offset); - } - } - private: // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. @@ -78,7 +70,5 @@ class BlockPrefetcher { uint64_t prev_offset_ = 0; size_t prev_len_ = 0; std::unique_ptr prefetch_buffer_; - - uint64_t upper_bound_offset_ = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 9082a08e9f8..b1a855263da 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -6,9 +6,8 @@ #include "table/block_based/block.h" -#include - #include +#include #include #include #include diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 2841b271dea..d7bee167510 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -551,12 +551,14 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, file_writer.reset( new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions())); std::unique_ptr builder; - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions( - ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, options.compression, + ioptions, moptions, read_options, write_options, internal_comparator, + &internal_tbl_prop_coll_factories, options.compression, CompressionOptions(), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, level_), @@ -567,7 +569,7 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, EXPECT_TRUE(builder->status().ok()); Status s = builder->Finish(); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(sink->contents().size(), builder->FileSize()); diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 19b880a900a..f3c3fb256b2 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -72,9 +72,9 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { detect_filter_construct_corruption_( detect_filter_construct_corruption) {} - ~XXPH3FilterBitsBuilder() override {} + ~XXPH3FilterBitsBuilder() override = default; - virtual void AddKey(const Slice& key) override { + void AddKey(const Slice& key) override { uint64_t hash = GetSliceHash64(key); // Especially with prefixes, it is common to have repetition, // though only adjacent repetition, which we want to immediately @@ -100,11 +100,11 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { } } - virtual size_t EstimateEntriesAdded() override { + size_t EstimateEntriesAdded() override { return hash_entries_info_.entries.size(); } - virtual Status MaybePostVerify(const Slice& filter_content) override; + Status MaybePostVerify(const Slice& filter_content) override; protected: static constexpr uint32_t kMetadataLen = 5; @@ -321,16 +321,15 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; void operator=(const FastLocalBloomBitsBuilder&) = delete; - ~FastLocalBloomBitsBuilder() override {} + ~FastLocalBloomBitsBuilder() override = default; using FilterBitsBuilder::Finish; - virtual Slice Finish(std::unique_ptr* buf) override { + Slice Finish(std::unique_ptr* buf) override { return Finish(buf, nullptr); } - virtual Slice Finish(std::unique_ptr* buf, - Status* status) override { + Slice Finish(std::unique_ptr* buf, Status* status) override { size_t num_entries = hash_entries_info_.entries.size(); size_t len_with_metadata = CalculateSpace(num_entries); @@ -525,7 +524,7 @@ class FastLocalBloomBitsReader : public BuiltinFilterBitsReader { FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; void operator=(const FastLocalBloomBitsReader&) = delete; - ~FastLocalBloomBitsReader() override {} + ~FastLocalBloomBitsReader() override = default; bool MayMatch(const Slice& key) override { uint64_t h = GetSliceHash64(key); @@ -536,7 +535,7 @@ class FastLocalBloomBitsReader : public BuiltinFilterBitsReader { data_ + byte_offset); } - virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + void MayMatch(int num_keys, Slice** keys, bool* may_match) override { std::array hashes; std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { @@ -606,16 +605,15 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; void operator=(const Standard128RibbonBitsBuilder&) = delete; - ~Standard128RibbonBitsBuilder() override {} + ~Standard128RibbonBitsBuilder() override = default; using FilterBitsBuilder::Finish; - virtual Slice Finish(std::unique_ptr* buf) override { + Slice Finish(std::unique_ptr* buf) override { return Finish(buf, nullptr); } - virtual Slice Finish(std::unique_ptr* buf, - Status* status) override { + Slice Finish(std::unique_ptr* buf, Status* status) override { if (hash_entries_info_.entries.size() > kMaxRibbonEntries) { ROCKS_LOG_WARN( info_log_, "Too many keys for Ribbon filter: %llu", @@ -967,14 +965,14 @@ class Standard128RibbonBitsReader : public BuiltinFilterBitsReader { Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete; void operator=(const Standard128RibbonBitsReader&) = delete; - ~Standard128RibbonBitsReader() override {} + ~Standard128RibbonBitsReader() override = default; bool MayMatch(const Slice& key) override { uint64_t h = GetSliceHash64(key); return soln_.FilterQuery(h, hasher_); } - virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + void MayMatch(int num_keys, Slice** keys, bool* may_match) override { struct SavedData { uint64_t seeded_hash; uint32_t segment_num; @@ -1020,9 +1018,7 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { void AddKey(const Slice& key) override; - virtual size_t EstimateEntriesAdded() override { - return hash_entries_.size(); - } + size_t EstimateEntriesAdded() override { return hash_entries_.size(); } using FilterBitsBuilder::Finish; @@ -1070,7 +1066,7 @@ LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, assert(bits_per_key_); } -LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() = default; void LegacyBloomBitsBuilder::AddKey(const Slice& key) { uint32_t hash = BloomHash(key); @@ -1220,7 +1216,7 @@ class LegacyBloomBitsReader : public BuiltinFilterBitsReader { LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; void operator=(const LegacyBloomBitsReader&) = delete; - ~LegacyBloomBitsReader() override {} + ~LegacyBloomBitsReader() override = default; // "contents" contains the data built by a preceding call to // FilterBitsBuilder::Finish. MayMatch must return true if the key was @@ -1236,7 +1232,7 @@ class LegacyBloomBitsReader : public BuiltinFilterBitsReader { hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); } - virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + void MayMatch(int num_keys, Slice** keys, bool* may_match) override { std::array hashes; std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { @@ -1359,7 +1355,7 @@ BloomLikeFilterPolicy::BloomLikeFilterPolicy(double bits_per_key) whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; } -BloomLikeFilterPolicy::~BloomLikeFilterPolicy() {} +BloomLikeFilterPolicy::~BloomLikeFilterPolicy() = default; const char* BloomLikeFilterPolicy::kClassName() { return "rocksdb.internal.BloomLikeFilter"; } @@ -1805,7 +1801,7 @@ FilterBuildingContext::FilterBuildingContext( const BlockBasedTableOptions& _table_options) : table_options(_table_options) {} -FilterPolicy::~FilterPolicy() {} +FilterPolicy::~FilterPolicy() = default; std::shared_ptr BloomLikeFilterPolicy::Create( const std::string& name, double bits_per_key) { diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 60ff7c44f39..0d7d9a59968 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -259,7 +259,7 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range, bool no_io, } } - filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); + filter_bits_reader->MayMatch(num_keys, keys.data(), may_match.data()); int i = 0; for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 7b0890d10c5..5bffd53520f 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -49,19 +49,16 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // directly. and be deleted here ~FullFilterBlockBuilder() {} - virtual void Add(const Slice& key_without_ts) override; - virtual bool IsEmpty() const override { return !any_added_; } - virtual size_t EstimateEntriesAdded() override; - virtual Slice Finish( - const BlockHandle& tmp, Status* status, - std::unique_ptr* filter_data = nullptr) override; + void Add(const Slice& key_without_ts) override; + bool IsEmpty() const override { return !any_added_; } + size_t EstimateEntriesAdded() override; + Slice Finish(const BlockHandle& tmp, Status* status, + std::unique_ptr* filter_data = nullptr) override; using FilterBlockBuilder::Finish; - virtual void ResetFilterBitsBuilder() override { - filter_bits_builder_.reset(); - } + void ResetFilterBitsBuilder() override { filter_bits_builder_.reset(); } - virtual Status MaybePostVerifyFilter(const Slice& filter_content) override { + Status MaybePostVerifyFilter(const Slice& filter_content) override { return filter_bits_builder_->MaybePostVerify(filter_content); } diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 0268b7b2715..154c8c0907f 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -23,7 +23,7 @@ namespace ROCKSDB_NAMESPACE { class TestFilterBitsBuilder : public FilterBitsBuilder { public: - explicit TestFilterBitsBuilder() {} + explicit TestFilterBitsBuilder() = default; // Add Key to filter void AddKey(const Slice& key) override { @@ -197,7 +197,7 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { public: explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {} - ~CountUniqueFilterBitsBuilderWrapper() override {} + ~CountUniqueFilterBitsBuilderWrapper() override = default; void AddKey(const Slice& key) override { b_->AddKey(key); diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index a9e02a28727..98d084b344b 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -9,8 +9,7 @@ #include "table/block_based/index_builder.h" -#include - +#include #include #include #include diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index c908db41d35..36398e8e2ef 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -495,11 +495,9 @@ Status PartitionedFilterBlockReader::CacheDependencies( std::unique_ptr prefetch_buffer; if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled() || tail_prefetch_buffer->GetPrefetchOffset() > prefetch_off) { - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/, - /*upper_bound_offset*/ 0, /*readaheadsize_cb*/ nullptr, - /*usage=*/FilePrefetchBufferUsage::kUnknown); + rep->CreateFilePrefetchBuffer(ReadaheadParams(), &prefetch_buffer, + /*readaheadsize_cb*/ nullptr, + /*usage=*/FilePrefetchBufferUsage::kUnknown); IOOptions opts; s = rep->file->PrepareIOOptions(ro, opts); diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 817fe94245a..35a440712a3 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -40,11 +40,10 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { void Add(const Slice& key) override; size_t EstimateEntriesAdded() override; - virtual Slice Finish( - const BlockHandle& last_partition_block_handle, Status* status, - std::unique_ptr* filter_data = nullptr) override; + Slice Finish(const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr* filter_data = nullptr) override; - virtual void ResetFilterBitsBuilder() override { + void ResetFilterBitsBuilder() override { // Previously constructed partitioned filters by // this to-be-reset FiterBitsBuilder can also be // cleared @@ -56,8 +55,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { // as part of PartitionFilterBlockBuilder::Finish // to avoid implementation complexity of doing it elsewhere. // Therefore we are skipping it in here. - virtual Status MaybePostVerifyFilter( - const Slice& /* filter_content */) override { + Status MaybePostVerifyFilter(const Slice& /* filter_content */) override { return Status::OK(); } diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 1d6e2fced84..50bb77975b9 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -87,7 +87,7 @@ class PartitionedFilterBlockTest table_options_.index_block_restart_interval = 3; } - ~PartitionedFilterBlockTest() override {} + ~PartitionedFilterBlockTest() override = default; static constexpr int kKeyNum = 4; static constexpr int kMissingKeyNum = 2; @@ -200,7 +200,7 @@ class PartitionedFilterBlockTest // Querying added keys const bool no_io = true; std::vector keys = PrepareKeys(keys_without_ts, kKeyNum); - for (auto key : keys) { + for (const auto& key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch( @@ -220,7 +220,7 @@ class PartitionedFilterBlockTest // querying missing keys std::vector missing_keys = PrepareKeys(missing_keys_without_ts, kMissingKeyNum); - for (auto key : missing_keys) { + for (const auto& key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { @@ -386,7 +386,7 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( NewReader(builder.get(), pib.get())); - for (auto key : pkeys) { + for (const auto& key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), @@ -400,7 +400,7 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { "p-key31"}; std::vector pnonkeys = PrepareKeys(pnonkeys_without_ts, 4 /* number_of_keys */); - for (auto key : pnonkeys) { + for (const auto& key : pnonkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), @@ -440,7 +440,7 @@ TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { CutABlock(pib.get(), pkeys[4]); std::unique_ptr reader( NewReader(builder.get(), pib.get())); - for (auto key : pkeys) { + for (const auto& key : pkeys) { auto prefix = prefix_extractor->Transform(key); auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); diff --git a/table/block_based/partitioned_index_iterator.cc b/table/block_based/partitioned_index_iterator.cc index cc6f7013092..d774a78d56e 100644 --- a/table/block_based/partitioned_index_iterator.cc +++ b/table/block_based/partitioned_index_iterator.cc @@ -92,7 +92,7 @@ void PartitionedIndexIterator::InitPartitionedIndexBlock() { block_prefetcher_.PrefetchIfNeeded( rep, partitioned_index_handle, read_options_.readahead_size, is_for_compaction, /*no_sequential_checking=*/false, read_options_, - /*readaheadsize_cb=*/nullptr); + /*readaheadsize_cb=*/nullptr, /*is_async_io_prefetch=*/false); Status s; table_->NewDataBlockIterator( read_options_, partitioned_index_handle, &block_iter_, diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index f825907180a..b76e34beee7 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -167,11 +167,9 @@ Status PartitionIndexReader::CacheDependencies( std::unique_ptr prefetch_buffer; if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled() || tail_prefetch_buffer->GetPrefetchOffset() > prefetch_off) { - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/, - /*upper_bound_offset*/ 0, /*readaheadsize_cb*/ nullptr, - /*usage=*/FilePrefetchBufferUsage::kUnknown); + rep->CreateFilePrefetchBuffer(ReadaheadParams(), &prefetch_buffer, + /*readaheadsize_cb*/ nullptr, + /*usage=*/FilePrefetchBufferUsage::kUnknown); IOOptions opts; { Status s = rep->file->PrepareIOOptions(ro, opts); diff --git a/table/block_based/reader_common.cc b/table/block_based/reader_common.cc index 7d0c97c717d..8f8c82ff43a 100644 --- a/table/block_based/reader_common.cc +++ b/table/block_based/reader_common.cc @@ -17,8 +17,8 @@ namespace ROCKSDB_NAMESPACE { void ForceReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); + Cache* cache = static_cast(arg); + Cache::Handle* handle = static_cast(h); cache->Release(handle, true /* erase_if_last_ref */); } diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h index 08c2a756bb0..89518fd8c2a 100644 --- a/table/block_based/reader_common.h +++ b/table/block_based/reader_common.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class Footer; // Release the cached entry and decrement its ref count. -extern void ForceReleaseCachedEntry(void* arg, void* h); +void ForceReleaseCachedEntry(void* arg, void* h); inline MemoryAllocator* GetMemoryAllocator( const BlockBasedTableOptions& table_options) { @@ -30,8 +30,7 @@ inline MemoryAllocator* GetMemoryAllocator( // // Returns Status::OK() on checksum match, or Status::Corruption() on checksum // mismatch. -extern Status VerifyBlockChecksum(const Footer& footer, const char* data, - size_t block_size, - const std::string& file_name, - uint64_t offset); +Status VerifyBlockChecksum(const Footer& footer, const char* data, + size_t block_size, const std::string& file_name, + uint64_t offset); } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 257a1a42ea8..31b6d93888f 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -76,23 +76,17 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() { IOOptions opts; IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); if (io_s.ok()) { - bool read_from_prefetch_buffer = false; - if (read_options_.async_io && !for_compaction_) { - read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync( - opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, - &io_s); - } else { - read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache( - opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, - &io_s, for_compaction_); - } + bool read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache( + opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, + &io_s, for_compaction_); if (read_from_prefetch_buffer) { ProcessTrailerIfPresent(); - if (!io_status_.ok()) { + if (io_status_.ok()) { + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast(slice_.data()); + } else if (!(io_status_.IsCorruption() && retry_corrupt_read_)) { return true; } - got_from_prefetch_buffer_ = true; - used_buf_ = const_cast(slice_.data()); } } if (!io_s.ok()) { @@ -198,16 +192,21 @@ inline void BlockFetcher::CopyBufferToCompressedBuf() { #endif } -// Entering this method means the block is not compressed or do not need to be -// uncompressed. The block can be in one of the following buffers: +// Before - Entering this method means the block is uncompressed or do not need +// to be uncompressed. +// +// The block can be in one of the following buffers: // 1. prefetch buffer if prefetch is enabled and the block is prefetched before // 2. stack_buf_ if block size is smaller than the stack_buf_ size and block // is not compressed // 3. heap_buf_ if the block is not compressed // 4. compressed_buf_ if the block is compressed -// 5. direct_io_buf_ if direct IO is enabled -// After this method, if the block is compressed, it should be in -// compressed_buf_, otherwise should be in heap_buf_. +// 5. direct_io_buf_ if direct IO is enabled or +// 6. underlying file_system scratch is used (FSReadRequest.fs_scratch). +// +// After - After this method, if the block is compressed, it should be in +// compressed_buf_ and heap_buf_ points to compressed_buf_, otherwise should be +// in heap_buf_. inline void BlockFetcher::GetBlockContents() { if (slice_.data() != used_buf_) { // the slice content is not the buffer provided @@ -224,7 +223,7 @@ inline void BlockFetcher::GetBlockContents() { } else { heap_buf_ = std::move(compressed_buf_); } - } else if (direct_io_buf_.get() != nullptr) { + } else if (direct_io_buf_.get() != nullptr || use_fs_scratch_) { if (compression_type_ == kNoCompression) { CopyBufferToHeapBuf(); } else { @@ -239,88 +238,137 @@ inline void BlockFetcher::GetBlockContents() { #endif } -IOStatus BlockFetcher::ReadBlockContents() { - if (TryGetUncompressBlockFromPersistentCache()) { - compression_type_ = kNoCompression; -#ifndef NDEBUG - contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; -#endif // NDEBUG - return IOStatus::OK(); - } - if (TryGetFromPrefetchBuffer()) { - if (!io_status_.ok()) { - return io_status_; - } - } else if (!TryGetSerializedBlockFromPersistentCache()) { - IOOptions opts; - io_status_ = file_->PrepareIOOptions(read_options_, opts); - // Actual file read - if (io_status_.ok()) { - if (file_->use_direct_io()) { - PERF_TIMER_GUARD(block_read_time); - PERF_CPU_TIMER_GUARD(block_read_cpu_time, nullptr); - io_status_ = - file_->Read(opts, handle_.offset(), block_size_with_trailer_, - &slice_, nullptr, &direct_io_buf_); - PERF_COUNTER_ADD(block_read_count, 1); - used_buf_ = const_cast(slice_.data()); - } else { - PrepareBufferForBlockFromFile(); - PERF_TIMER_GUARD(block_read_time); - PERF_CPU_TIMER_GUARD(block_read_cpu_time, nullptr); - io_status_ = - file_->Read(opts, handle_.offset(), block_size_with_trailer_, - &slice_, used_buf_, nullptr); - PERF_COUNTER_ADD(block_read_count, 1); +// Read a block from the file and verify its checksum. Upon return, io_status_ +// will be updated with the status of the read, and slice_ will be updated +// with a pointer to the data. +void BlockFetcher::ReadBlock(bool retry, FSAllocationPtr& fs_buf) { + FSReadRequest read_req; + IOOptions opts; + io_status_ = file_->PrepareIOOptions(read_options_, opts); + opts.verify_and_reconstruct_read = retry; + read_req.status.PermitUncheckedError(); + // Actual file read + if (io_status_.ok()) { + if (file_->use_direct_io()) { + PERF_TIMER_GUARD(block_read_time); + PERF_CPU_TIMER_GUARD( + block_read_cpu_time, + ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr); + io_status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, /*scratch=*/nullptr, &direct_io_buf_); + PERF_COUNTER_ADD(block_read_count, 1); + used_buf_ = const_cast(slice_.data()); + } else if (use_fs_scratch_) { + PERF_TIMER_GUARD(block_read_time); + PERF_CPU_TIMER_GUARD( + block_read_cpu_time, + ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr); + read_req.offset = handle_.offset(); + read_req.len = block_size_with_trailer_; + read_req.scratch = nullptr; + io_status_ = file_->MultiRead(opts, &read_req, /*num_reqs=*/1, + /*AlignedBuf* =*/nullptr); + PERF_COUNTER_ADD(block_read_count, 1); + + slice_ = Slice(read_req.result.data(), read_req.result.size()); + used_buf_ = const_cast(slice_.data()); + } else { + // It allocates/assign used_buf_ + PrepareBufferForBlockFromFile(); + + PERF_TIMER_GUARD(block_read_time); + PERF_CPU_TIMER_GUARD( + block_read_cpu_time, + ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr); + + io_status_ = file_->Read( + opts, handle_.offset(), /*size*/ block_size_with_trailer_, + /*result*/ &slice_, /*scratch*/ used_buf_, /*aligned_buf=*/nullptr); + PERF_COUNTER_ADD(block_read_count, 1); #ifndef NDEBUG - if (slice_.data() == &stack_buf_[0]) { - num_stack_buf_memcpy_++; - } else if (slice_.data() == heap_buf_.get()) { - num_heap_buf_memcpy_++; - } else if (slice_.data() == compressed_buf_.get()) { - num_compressed_buf_memcpy_++; - } -#endif + if (slice_.data() == &stack_buf_[0]) { + num_stack_buf_memcpy_++; + } else if (slice_.data() == heap_buf_.get()) { + num_heap_buf_memcpy_++; + } else if (slice_.data() == compressed_buf_.get()) { + num_compressed_buf_memcpy_++; } +#endif } + } - // TODO: introduce dedicated perf counter for range tombstones - switch (block_type_) { - case BlockType::kFilter: - case BlockType::kFilterPartitionIndex: - PERF_COUNTER_ADD(filter_block_read_count, 1); - break; - - case BlockType::kCompressionDictionary: - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); - break; + // TODO: introduce dedicated perf counter for range tombstones + switch (block_type_) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + PERF_COUNTER_ADD(filter_block_read_count, 1); + break; - case BlockType::kIndex: - PERF_COUNTER_ADD(index_block_read_count, 1); - break; + case BlockType::kCompressionDictionary: + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + break; - // Nothing to do here as we don't have counters for the other types. - default: - break; - } + case BlockType::kIndex: + PERF_COUNTER_ADD(index_block_read_count, 1); + break; - PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); - if (!io_status_.ok()) { - return io_status_; - } + // Nothing to do here as we don't have counters for the other types. + default: + break; + } - if (slice_.size() != block_size_with_trailer_) { - return IOStatus::Corruption( + PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); + if (io_status_.ok()) { + if (use_fs_scratch_ && !read_req.status.ok()) { + io_status_ = read_req.status; + } else if (slice_.size() != block_size_with_trailer_) { + io_status_ = IOStatus::Corruption( "truncated block read from " + file_->file_name() + " offset " + std::to_string(handle_.offset()) + ", expected " + std::to_string(block_size_with_trailer_) + " bytes, got " + std::to_string(slice_.size())); } + } + if (io_status_.ok()) { ProcessTrailerIfPresent(); - if (io_status_.ok()) { - InsertCompressedBlockToPersistentCacheIfNeeded(); - } else { + } + + if (io_status_.ok()) { + InsertCompressedBlockToPersistentCacheIfNeeded(); + fs_buf = std::move(read_req.fs_scratch); + } else { + ReleaseFileSystemProvidedBuffer(&read_req); + direct_io_buf_.reset(); + compressed_buf_.reset(); + heap_buf_.reset(); + used_buf_ = nullptr; + } +} + +IOStatus BlockFetcher::ReadBlockContents() { + FSAllocationPtr fs_buf; + if (TryGetUncompressBlockFromPersistentCache()) { + compression_type_ = kNoCompression; +#ifndef NDEBUG + contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; +#endif // NDEBUG + return IOStatus::OK(); + } + if (TryGetFromPrefetchBuffer()) { + if (!io_status_.ok()) { + return io_status_; + } + } else if (!TryGetSerializedBlockFromPersistentCache()) { + ReadBlock(/*retry =*/false, fs_buf); + // If the file system supports retry after corruption, then try to + // re-read the block and see if it succeeds. + if (io_status_.IsCorruption() && retry_corrupt_read_) { + assert(!fs_buf); + ReadBlock(/*retry=*/true, fs_buf); + } + if (!io_status_.ok()) { + assert(!fs_buf); return io_status_; } } @@ -369,10 +417,16 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() { return io_s; } if (io_s.ok()) { + FSAllocationPtr fs_buf; // Data Block is already in prefetch. got_from_prefetch_buffer_ = true; ProcessTrailerIfPresent(); + if (io_status_.IsCorruption() && retry_corrupt_read_) { + got_from_prefetch_buffer_ = false; + ReadBlock(/*retry = */ true, fs_buf); + } if (!io_status_.ok()) { + assert(!fs_buf); return io_status_; } used_buf_ = const_cast(slice_.data()); diff --git a/table/block_fetcher.h b/table/block_fetcher.h index e5a51e3eb25..46e643f91b2 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include "file/file_util.h" #include "memory/memory_allocator_impl.h" #include "table/block_based/block.h" #include "table/block_based/block_type.h" @@ -68,6 +69,13 @@ class BlockFetcher { memory_allocator_compressed_(memory_allocator_compressed), for_compaction_(for_compaction) { io_status_.PermitUncheckedError(); // TODO(AR) can we improve on this? + if (CheckFSFeatureSupport(ioptions_.fs.get(), FSSupportedOps::kFSBuffer)) { + use_fs_scratch_ = true; + } + if (CheckFSFeatureSupport(ioptions_.fs.get(), + FSSupportedOps::kVerifyAndReconstructRead)) { + retry_corrupt_read_ = true; + } } IOStatus ReadBlockContents(); @@ -127,6 +135,8 @@ class BlockFetcher { bool got_from_prefetch_buffer_ = false; CompressionType compression_type_; bool for_compaction_ = false; + bool use_fs_scratch_ = false; + bool retry_corrupt_read_ = false; // return true if found bool TryGetUncompressBlockFromPersistentCache(); @@ -142,5 +152,16 @@ class BlockFetcher { void InsertCompressedBlockToPersistentCacheIfNeeded(); void InsertUncompressedBlockToPersistentCacheIfNeeded(); void ProcessTrailerIfPresent(); + void ReadBlock(bool retry, FSAllocationPtr& fs_buf); + + void ReleaseFileSystemProvidedBuffer(FSReadRequest* read_req) { + if (use_fs_scratch_) { + // Free the scratch buffer allocated by FileSystem. + if (read_req->fs_scratch != nullptr) { + read_req->fs_scratch.reset(); + read_req->fs_scratch = nullptr; + } + } + } }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index d738fa3df8a..ca8fcb7e505 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -76,12 +76,14 @@ class BlockFetcherTest : public testing::Test { InternalKeyComparator comparator(options_.comparator); ColumnFamilyOptions cf_options(options_); MutableCFOptions moptions(cf_options); - IntTblPropCollectorFactories factories; + InternalTblPropCollFactories factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr table_builder(table_factory_.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, CompressionOptions(), - 0 /* column_family_id */, kDefaultColumnFamilyName, - -1 /* level */), + TableBuilderOptions(ioptions, moptions, read_options, write_options, + comparator, &factories, compression_type, + CompressionOptions(), 0 /* column_family_id */, + kDefaultColumnFamilyName, -1 /* level */), writer.get())); // Build table. @@ -105,11 +107,16 @@ class BlockFetcherTest : public testing::Test { // Get handle of the index block. Footer footer; - ReadFooter(file.get(), &footer); - const BlockHandle& index_handle = footer.index_handle(); - // FIXME: index handle will need to come from metaindex for - // format_version >= 6 when that becomes the default - ASSERT_FALSE(index_handle.IsNull()); + uint64_t file_size = 0; + ReadFooter(file.get(), &footer, &file_size); + + // Index handle comes from metaindex for format_version >= 6 + ASSERT_TRUE(footer.index_handle().IsNull()); + + BlockHandle index_handle; + ASSERT_OK(FindMetaBlockInFile( + file.get(), file_size, kBlockBasedTableMagicNumber, + ImmutableOptions(options_), {}, kIndexBlockName, &index_handle)); CompressionType compression_type; FetchBlock(file.get(), index_handle, BlockType::kIndex, @@ -134,7 +141,9 @@ class BlockFetcherTest : public testing::Test { std::array expected_stats_by_mode) { for (CompressionType compression_type : GetSupportedCompressions()) { bool do_compress = compression_type != kNoCompression; - if (compressed != do_compress) continue; + if (compressed != do_compress) { + continue; + } std::string compression_type_str = CompressionTypeToString(compression_type); @@ -274,7 +283,7 @@ class BlockFetcherTest : public testing::Test { 0 /* block_protection_bytes_per_key */, &table_reader, 0 /* tail_size */)); - table->reset(reinterpret_cast(table_reader.release())); + table->reset(static_cast(table_reader.release())); } std::string ToInternalKey(const std::string& key) { @@ -282,13 +291,17 @@ class BlockFetcherTest : public testing::Test { return internal_key.Encode().ToString(); } - void ReadFooter(RandomAccessFileReader* file, Footer* footer) { + void ReadFooter(RandomAccessFileReader* file, Footer* footer, + uint64_t* file_size_out = nullptr) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size)); IOOptions opts; ASSERT_OK(ReadFooterFromFile(opts, file, *fs_, nullptr /* prefetch_buffer */, file_size, footer, kBlockBasedTableMagicNumber)); + if (file_size_out) { + *file_size_out = file_size; + } } // NOTE: compression_type returns the compression type of the fetched block diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index 0cf6834af81..66ec2b094ff 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -5,9 +5,8 @@ #include "table/cuckoo/cuckoo_table_builder.h" -#include - #include +#include #include #include #include @@ -45,7 +44,7 @@ const std::string CuckooTablePropertyNames::kUserKeyLength = "rocksdb.cuckoo.hash.userkeylength"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum -extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; +const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; CuckooTableBuilder::CuckooTableBuilder( WritableFileWriter* file, double max_hash_table_ratio, @@ -319,15 +318,16 @@ Status CuckooTableBuilder::Finish() { unused_bucket.resize(static_cast(bucket_size), 'a'); // Write the table. uint32_t num_added = 0; + const IOOptions opts; for (auto& bucket : buckets) { if (bucket.vector_idx == kMaxVectorIdx) { - io_status_ = file_->Append(Slice(unused_bucket)); + io_status_ = file_->Append(opts, Slice(unused_bucket)); } else { ++num_added; - io_status_ = file_->Append(GetKey(bucket.vector_idx)); + io_status_ = file_->Append(opts, GetKey(bucket.vector_idx)); if (io_status_.ok()) { if (value_size_ > 0) { - io_status_ = file_->Append(GetValue(bucket.vector_idx)); + io_status_ = file_->Append(opts, GetValue(bucket.vector_idx)); } } } @@ -383,7 +383,7 @@ Status CuckooTableBuilder::Finish() { BlockHandle property_block_handle; property_block_handle.set_offset(offset); property_block_handle.set_size(property_block.size()); - io_status_ = file_->Append(property_block); + io_status_ = file_->Append(opts, property_block); offset += property_block.size(); if (!io_status_.ok()) { status_ = io_status_; @@ -396,7 +396,7 @@ Status CuckooTableBuilder::Finish() { BlockHandle meta_index_block_handle; meta_index_block_handle.set_offset(offset); meta_index_block_handle.set_size(meta_index_block.size()); - io_status_ = file_->Append(meta_index_block); + io_status_ = file_->Append(opts, meta_index_block); if (!io_status_.ok()) { status_ = io_status_; return status_; @@ -409,7 +409,7 @@ Status CuckooTableBuilder::Finish() { status_ = s; return status_; } - io_status_ = file_->Append(footer.GetSlice()); + io_status_ = file_->Append(opts, footer.GetSlice()); status_ = io_status_; return status_; } @@ -481,7 +481,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( uint64_t bid = hash_vals[hash_cnt]; (*buckets)[static_cast(bid)].make_space_for_key_call_id = make_space_for_key_call_id; - tree.push_back(CuckooNode(bid, 0, 0)); + tree.emplace_back(bid, 0, 0); } bool null_found = false; uint32_t curr_pos = 0; @@ -507,7 +507,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( } (*buckets)[static_cast(child_bucket_id)] .make_space_for_key_call_id = make_space_for_key_call_id; - tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos)); + tree.emplace_back(child_bucket_id, curr_depth + 1, curr_pos); if ((*buckets)[static_cast(child_bucket_id)].vector_idx == kMaxVectorIdx) { null_found = true; diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 1a0d58c76d1..967e8e2db7b 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -182,7 +182,7 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); CheckFileContents({}, {}, {}, "", 2, 2, false); } @@ -229,7 +229,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -277,7 +277,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -325,7 +325,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -374,7 +374,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -420,7 +420,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -463,7 +463,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -507,7 +507,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -550,7 +550,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -589,7 +589,7 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { ASSERT_OK(builder.status()); } ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); } TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { @@ -619,7 +619,7 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { ASSERT_OK(builder.status()); ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index a4479ab60cd..5be1ebc19ec 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -43,7 +43,7 @@ CuckooTableReader::CuckooTableReader( identity_as_first_hash_(false), use_module_hash_(false), num_hash_func_(0), - unused_key_(""), + key_length_(0), user_key_length_(0), value_length_(0), @@ -59,7 +59,7 @@ CuckooTableReader::CuckooTableReader( } { std::unique_ptr props; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, @@ -182,9 +182,14 @@ Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, ParsedInternalKey found_ikey; Status s = ParseInternalKey(full_key, &found_ikey, false /* log_err_key */); // TODO - if (!s.ok()) return s; + if (!s.ok()) { + return s; + } bool dont_care __attribute__((__unused__)); - get_context->SaveValue(found_ikey, value, &dont_care); + get_context->SaveValue(found_ikey, value, &dont_care, &s); + if (!s.ok()) { + return s; + } } // We don't support merge operations. So, we return here. return Status::OK(); @@ -213,7 +218,7 @@ class CuckooTableIterator : public InternalIterator { // No copying allowed CuckooTableIterator(const CuckooTableIterator&) = delete; void operator=(const Iterator&) = delete; - ~CuckooTableIterator() override {} + ~CuckooTableIterator() override = default; bool Valid() const override; void SeekToFirst() override; void SeekToLast() override; diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index e83baa10779..25e2c1bca4b 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -104,7 +104,7 @@ class CuckooReaderTest : public testing::Test { ASSERT_OK(builder.Finish()); ASSERT_EQ(num_items, builder.NumEntries()); file_size = builder.FileSize(); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); // Check reader now. std::unique_ptr file_reader; @@ -249,7 +249,7 @@ TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) { fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i].resize(8); - memcpy(&user_keys[i][0], static_cast(&i), 8); + memcpy(user_keys[i].data(), static_cast(&i), 8); ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); AppendInternalKey(&keys[i], ikey); values[i] = "value" + NumToStr(i); @@ -296,7 +296,7 @@ TEST_F(CuckooReaderTest, CheckIteratorUint64) { fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i].resize(8); - memcpy(&user_keys[i][0], static_cast(&i), 8); + memcpy(user_keys[i].data(), static_cast(&i), 8); ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); AppendInternalKey(&keys[i], ikey); values[i] = "value" + NumToStr(i); @@ -425,13 +425,13 @@ void WriteFile(const std::vector& keys, const uint64_t num, ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. - builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4)); + builder.Add(Slice(keys[key_idx]), Slice(keys[key_idx].data(), 4)); ASSERT_EQ(builder.NumEntries(), key_idx + 1); ASSERT_OK(builder.status()); } ASSERT_OK(builder.Finish()); ASSERT_EQ(num, builder.NumEntries()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); uint64_t file_size; ASSERT_OK( @@ -454,7 +454,7 @@ void WriteFile(const std::vector& keys, const uint64_t num, value.Reset(); value.clear(); ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr)); - ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); + ASSERT_TRUE(Slice(keys[i]) == Slice(keys[i].data(), 4)); } } @@ -571,4 +571,3 @@ int main(int argc, char** argv) { } #endif // GFLAGS. - diff --git a/table/format.cc b/table/format.cc index 27ecce54724..627e5eac1bc 100644 --- a/table/format.cc +++ b/table/format.cc @@ -38,11 +38,6 @@ namespace ROCKSDB_NAMESPACE { -extern const uint64_t kLegacyBlockBasedTableMagicNumber; -extern const uint64_t kBlockBasedTableMagicNumber; - -extern const uint64_t kLegacyPlainTableMagicNumber; -extern const uint64_t kPlainTableMagicNumber; const char* kHostnameForDbHostId = "__hostname__"; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { @@ -390,7 +385,7 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset, if (checksum_type_ != kNoChecksum && format_version_ >= 6) { std::array copy_without_checksum; std::copy_n(input.data(), kNewVersionsEncodedLength, - ©_without_checksum[0]); + copy_without_checksum.data()); EncodeFixed32(©_without_checksum[5], 0); // Clear embedded checksum computed_checksum = ComputeBuiltinChecksum(checksum_type(), copy_without_checksum.data(), @@ -466,19 +461,16 @@ std::string Footer::ToString() const { std::string result; result.reserve(1024); - bool legacy = IsLegacyFooterFormat(table_magic_number_); - if (legacy) { - result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); - result.append("index handle: " + index_handle_.ToString() + "\n "); - result.append("table_magic_number: " + std::to_string(table_magic_number_) + - "\n "); - } else { - result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); - result.append("index handle: " + index_handle_.ToString() + "\n "); - result.append("table_magic_number: " + std::to_string(table_magic_number_) + - "\n "); - result.append("format version: " + std::to_string(format_version_) + - "\n "); + result.append("metaindex handle: " + metaindex_handle_.ToString() + + " offset: " + std::to_string(metaindex_handle_.offset()) + + " size: " + std::to_string(metaindex_handle_.size()) + "\n "); + result.append("index handle: " + index_handle_.ToString() + + " offset: " + std::to_string(index_handle_.offset()) + + " size: " + std::to_string(index_handle_.size()) + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + if (!IsLegacyFooterFormat(table_magic_number_)) { + result.append("format version: " + std::to_string(format_version_) + "\n"); } return result; } @@ -518,9 +510,11 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, } else { footer_buf.reserve(Footer::kMaxEncodedLength); s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, - &footer_input, &footer_buf[0], nullptr); + &footer_input, footer_buf.data(), nullptr); + } + if (!s.ok()) { + return s; } - if (!s.ok()) return s; } // Check that we actually read the whole footer from the file. It may be diff --git a/table/format.h b/table/format.h index 73675381edb..cbd6d08fa3b 100644 --- a/table/format.h +++ b/table/format.h @@ -34,6 +34,14 @@ bool ShouldReportDetailedTime(Env* env, Statistics* stats); // the length of the magic number in bytes. constexpr uint32_t kMagicNumberLengthByte = 8; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; + +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; + +extern const uint64_t kCuckooTableMagicNumber; + // BlockHandle is a pointer to the extent of a file that stores a data // block or a meta block. class BlockHandle { diff --git a/table/get_context.cc b/table/get_context.cc index 660726cd392..763bd8d197e 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -19,22 +19,6 @@ namespace ROCKSDB_NAMESPACE { -namespace { - -void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { - if (replay_log) { - if (replay_log->empty()) { - // Optimization: in the common case of only one operation in the - // log, we allocate the exact amount of space needed. - replay_log->reserve(1 + VarintLength(value.size()) + value.size()); - } - replay_log->push_back(type); - PutLengthPrefixedSlice(replay_log, value); - } -} - -} // namespace - GetContext::GetContext( const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, @@ -88,6 +72,24 @@ GetContext::GetContext(const Comparator* ucmp, seq, _pinned_iters_mgr, callback, is_blob_index, tracing_get_id, blob_fetcher) {} +void GetContext::appendToReplayLog(ValueType type, Slice value, Slice ts) { + if (replay_log_) { + if (replay_log_->empty()) { + // Optimization: in the common case of only one operation in the + // log, we allocate the exact amount of space needed. + replay_log_->reserve(1 + VarintLength(value.size()) + value.size()); + } + replay_log_->push_back(type); + PutLengthPrefixedSlice(replay_log_, value); + + // If cf enables ts, there should always be a ts following each value + if (ucmp_->timestamp_size() > 0) { + assert(ts.size() == ucmp_->timestamp_size()); + PutLengthPrefixedSlice(replay_log_, ts); + } + } +} + // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. In this // case we can't guarantee that key does not exist and are not permitted to do @@ -102,7 +104,9 @@ void GetContext::MarkKeyMayExist() { void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) { assert(state_ == kNotFound); - appendToReplayLog(replay_log_, kTypeValue, value); + assert(ucmp_->timestamp_size() == 0); + + appendToReplayLog(kTypeValue, value, Slice()); state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { @@ -217,7 +221,7 @@ void GetContext::ReportCounters() { bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, const Slice& value, bool* matched, - Cleanable* value_pinner) { + Status* read_status, Cleanable* value_pinner) { assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); @@ -228,8 +232,6 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return true; // to continue to the next seq } - appendToReplayLog(replay_log_, parsed_key.type, value); - if (seq_ != nullptr) { // Set the sequence number if it is uninitialized if (*seq_ == kMaxSequenceNumber) { @@ -241,36 +243,43 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } size_t ts_sz = ucmp_->timestamp_size(); - if (ts_sz > 0 && timestamp_ != nullptr) { - if (!timestamp_->empty()) { - assert(ts_sz == timestamp_->size()); - // `timestamp` can be set before `SaveValue` is ever called - // when max_covering_tombstone_seq_ was set. - // If this key has a higher sequence number than range tombstone, - // then timestamp should be updated. `ts_from_rangetombstone_` is - // set to false afterwards so that only the key with highest seqno - // updates the timestamp. - if (ts_from_rangetombstone_) { - assert(max_covering_tombstone_seq_); - if (parsed_key.sequence > *max_covering_tombstone_seq_) { - Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); - timestamp_->assign(ts.data(), ts.size()); - ts_from_rangetombstone_ = false; + Slice ts; + + if (ts_sz > 0) { + // ensure always have ts if cf enables ts. + ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + if (timestamp_ != nullptr) { + if (!timestamp_->empty()) { + assert(ts_sz == timestamp_->size()); + // `timestamp` can be set before `SaveValue` is ever called + // when max_covering_tombstone_seq_ was set. + // If this key has a higher sequence number than range tombstone, + // then timestamp should be updated. `ts_from_rangetombstone_` is + // set to false afterwards so that only the key with highest seqno + // updates the timestamp. + if (ts_from_rangetombstone_) { + assert(max_covering_tombstone_seq_); + if (parsed_key.sequence > *max_covering_tombstone_seq_) { + timestamp_->assign(ts.data(), ts.size()); + ts_from_rangetombstone_ = false; + } } } - } - // TODO optimize for small size ts - const std::string kMaxTs(ts_sz, '\xff'); - if (timestamp_->empty() || - ucmp_->CompareTimestamp(*timestamp_, kMaxTs) == 0) { - Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); - timestamp_->assign(ts.data(), ts.size()); + // TODO optimize for small size ts + const std::string kMaxTs(ts_sz, '\xff'); + if (timestamp_->empty() || + ucmp_->CompareTimestamp(*timestamp_, kMaxTs) == 0) { + timestamp_->assign(ts.data(), ts.size()); + } } } + appendToReplayLog(parsed_key.type, value, ts); auto type = parsed_key.type; + Slice unpacked_value = value; // Key matches. Process it - if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex || + if ((type == kTypeValue || type == kTypeValuePreferredSeqno || + type == kTypeMerge || type == kTypeBlobIndex || type == kTypeWideColumnEntity || type == kTypeDeletion || type == kTypeDeletionWithTimestamp || type == kTypeSingleDeletion) && max_covering_tombstone_seq_ != nullptr && @@ -282,9 +291,13 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } switch (type) { case kTypeValue: + case kTypeValuePreferredSeqno: case kTypeBlobIndex: case kTypeWideColumnEntity: assert(state_ == kNotFound || state_ == kMerge); + if (type == kTypeValuePreferredSeqno) { + unpacked_value = ParsePackedValueForValue(value); + } if (type == kTypeBlobIndex) { if (is_blob_index_ == nullptr) { // Blob value not supported. Stop. @@ -304,10 +317,10 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, ukey_with_ts_found_.PinSelf(parsed_key.user_key); } if (LIKELY(pinnable_val_ != nullptr)) { - Slice value_to_use = value; + Slice value_to_use = unpacked_value; if (type == kTypeWideColumnEntity) { - Slice value_copy = value; + Slice value_copy = unpacked_value; if (!WideColumnSerialization::GetValueOfDefaultColumn( value_copy, value_to_use) @@ -328,12 +341,13 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } } else if (columns_ != nullptr) { if (type == kTypeWideColumnEntity) { - if (!columns_->SetWideColumnValue(value, value_pinner).ok()) { + if (!columns_->SetWideColumnValue(unpacked_value, value_pinner) + .ok()) { state_ = kCorrupt; return false; } } else { - columns_->SetPlainValue(value, value_pinner); + columns_->SetPlainValue(unpacked_value, value_pinner); } } } else { @@ -342,13 +356,14 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, // merge_context_->operand_list if (type == kTypeBlobIndex) { PinnableSlice pin_val; - if (GetBlobValue(parsed_key.user_key, value, &pin_val) == false) { + if (GetBlobValue(parsed_key.user_key, unpacked_value, &pin_val, + read_status) == false) { return false; } Slice blob_value(pin_val); push_operand(blob_value, nullptr); } else if (type == kTypeWideColumnEntity) { - Slice value_copy = value; + Slice value_copy = unpacked_value; Slice value_of_default; if (!WideColumnSerialization::GetValueOfDefaultColumn( @@ -360,15 +375,16 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, push_operand(value_of_default, value_pinner); } else { - assert(type == kTypeValue); - push_operand(value, value_pinner); + assert(type == kTypeValue || type == kTypeValuePreferredSeqno); + push_operand(unpacked_value, value_pinner); } } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); if (type == kTypeBlobIndex) { PinnableSlice pin_val; - if (GetBlobValue(parsed_key.user_key, value, &pin_val) == false) { + if (GetBlobValue(parsed_key.user_key, unpacked_value, &pin_val, + read_status) == false) { return false; } Slice blob_value(pin_val); @@ -385,12 +401,12 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kFound; if (do_merge_) { - MergeWithWideColumnBaseValue(value); + MergeWithWideColumnBaseValue(unpacked_value); } else { // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of // merge_context_->operand_list - Slice value_copy = value; + Slice value_copy = unpacked_value; Slice value_of_default; if (!WideColumnSerialization::GetValueOfDefaultColumn( @@ -403,16 +419,16 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, push_operand(value_of_default, value_pinner); } } else { - assert(type == kTypeValue); + assert(type == kTypeValue || type == kTypeValuePreferredSeqno); state_ = kFound; if (do_merge_) { - MergeWithPlainBaseValue(value); + MergeWithPlainBaseValue(unpacked_value); } else { // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of // merge_context_->operand_list - push_operand(value, value_pinner); + push_operand(unpacked_value, value_pinner); } } } @@ -451,6 +467,13 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, MergeWithNoBaseValue(); return false; } + if (merge_context_->get_merge_operands_options != nullptr && + merge_context_->get_merge_operands_options->continue_cb != + nullptr && + !merge_context_->get_merge_operands_options->continue_cb(value)) { + state_ = kFound; + return false; + } return true; default: @@ -488,9 +511,8 @@ void GetContext::MergeWithNoBaseValue() { const Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key_, MergeHelper::kNoBaseValue, merge_context_->GetOperands(), logger_, statistics_, clock_, - /* update_num_ops_stats */ true, - pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_); PostprocessMerge(s); } @@ -504,9 +526,8 @@ void GetContext::MergeWithPlainBaseValue(const Slice& value) { const Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key_, MergeHelper::kPlainBaseValue, value, merge_context_->GetOperands(), logger_, statistics_, clock_, - /* update_num_ops_stats */ true, - pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_); PostprocessMerge(s); } @@ -520,21 +541,20 @@ void GetContext::MergeWithWideColumnBaseValue(const Slice& entity) { const Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key_, MergeHelper::kWideBaseValue, entity, merge_context_->GetOperands(), logger_, statistics_, clock_, - /* update_num_ops_stats */ true, - pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, - /* op_failure_scope */ nullptr); + /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr, + pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_); PostprocessMerge(s); } bool GetContext::GetBlobValue(const Slice& user_key, const Slice& blob_index, - PinnableSlice* blob_value) { + PinnableSlice* blob_value, Status* read_status) { constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; constexpr uint64_t* bytes_read = nullptr; - Status status = blob_fetcher_->FetchBlob( - user_key, blob_index, prefetch_buffer, blob_value, bytes_read); - if (!status.ok()) { - if (status.IsIncomplete()) { + *read_status = blob_fetcher_->FetchBlob(user_key, blob_index, prefetch_buffer, + blob_value, bytes_read); + if (!read_status->ok()) { + if (read_status->IsIncomplete()) { // FIXME: this code is not covered by unit tests MarkKeyMayExist(); return false; @@ -557,23 +577,46 @@ void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { } } -void replayGetContextLog(const Slice& replay_log, const Slice& user_key, - GetContext* get_context, Cleanable* value_pinner, - SequenceNumber seq_no) { +Status replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, Cleanable* value_pinner, + SequenceNumber seq_no) { Slice s = replay_log; + Slice ts; + size_t ts_sz = get_context->TimestampSize(); + bool ret = false; + while (s.size()) { auto type = static_cast(*s.data()); s.remove_prefix(1); Slice value; - bool ret = GetLengthPrefixedSlice(&s, &value); + ret = GetLengthPrefixedSlice(&s, &value); assert(ret); - (void)ret; bool dont_care __attribute__((__unused__)); - ParsedInternalKey ikey = ParsedInternalKey(user_key, seq_no, type); - get_context->SaveValue(ikey, value, &dont_care, value_pinner); + // Use a copy to prevent modifying user_key. Modification of user_key + // could result to potential cache miss. + std::string user_key_str = user_key.ToString(); + ParsedInternalKey ikey = ParsedInternalKey(user_key_str, seq_no, type); + + // If ts enabled for current cf, there will always be ts appended after each + // piece of value. + if (ts_sz > 0) { + ret = GetLengthPrefixedSlice(&s, &ts); + assert(ts_sz == ts.size()); + assert(ret); + ikey.SetTimestamp(ts); + } + + (void)ret; + + Status read_status; + get_context->SaveValue(ikey, value, &dont_care, &read_status, value_pinner); + if (!read_status.ok()) { + return read_status; + } } + return Status::OK(); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/get_context.h b/table/get_context.h index b43ff6e1600..ada479001c9 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -135,7 +135,8 @@ class GetContext { // Returns True if more keys need to be read (due to merges) or // False if the complete value has been found. bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value, - bool* matched, Cleanable* value_pinner = nullptr); + bool* matched, Status* read_status, + Cleanable* value_pinner = nullptr); // Simplified version of the previous function. Should only be used when we // know that the operation is a Put. @@ -149,6 +150,8 @@ class GetContext { bool NeedTimestamp() { return timestamp_ != nullptr; } + inline size_t TimestampSize() { return ucmp_->timestamp_size(); } + void SetTimestampFromRangeTombstone(const Slice& timestamp) { assert(timestamp_); timestamp_->assign(timestamp.data(), timestamp.size()); @@ -202,7 +205,9 @@ class GetContext { void MergeWithWideColumnBaseValue(const Slice& entity); bool GetBlobValue(const Slice& user_key, const Slice& blob_index, - PinnableSlice* blob_value); + PinnableSlice* blob_value, Status* read_status); + + void appendToReplayLog(ValueType type, Slice value, Slice ts); const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -246,9 +251,9 @@ class GetContext { // Call this to replay a log and bring the get_context up to date. The replay // log must have been created by another GetContext object, whose replay log // must have been set by calling GetContext::SetReplayLog(). -void replayGetContextLog(const Slice& replay_log, const Slice& user_key, - GetContext* get_context, - Cleanable* value_pinner = nullptr, - SequenceNumber seq_no = kMaxSequenceNumber); +Status replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, + Cleanable* value_pinner = nullptr, + SequenceNumber seq_no = kMaxSequenceNumber); } // namespace ROCKSDB_NAMESPACE diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 060306003ce..8ecbb0f90b4 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -116,6 +116,14 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual Slice key() const = 0; + // Returns the approximate write time of this entry, which is deduced from + // sequence number if sequence number to time mapping is available. + // The default implementation returns maximum uint64_t and that indicates the + // write time is unknown. + virtual uint64_t write_unix_time() const { + return std::numeric_limits::max(); + } + // Return user key for the current entry. // REQUIRES: Valid() virtual Slice user_key() const { return ExtractUserKey(key()); } @@ -220,16 +228,15 @@ using InternalIterator = InternalIteratorBase; // Return an empty iterator (yields nothing). template -extern InternalIteratorBase* NewEmptyInternalIterator(); +InternalIteratorBase* NewEmptyInternalIterator(); // Return an empty iterator with the specified status. template -extern InternalIteratorBase* NewErrorInternalIterator( - const Status& status); +InternalIteratorBase* NewErrorInternalIterator(const Status& status); // Return an empty iterator with the specified status, allocated arena. template -extern InternalIteratorBase* NewErrorInternalIterator( - const Status& status, Arena* arena); +InternalIteratorBase* NewErrorInternalIterator(const Status& status, + Arena* arena); } // namespace ROCKSDB_NAMESPACE diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a9de3dff35c..b53076910ec 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -82,6 +82,12 @@ class IteratorWrapperBase { assert(Valid()); return result_.key; } + + uint64_t write_unix_time() const { + assert(Valid()); + return iter_->write_unix_time(); + } + TValue value() const { assert(Valid()); return iter_->value(); @@ -101,6 +107,7 @@ class IteratorWrapperBase { } if (iter_->PrepareValue()) { result_.value_prepared = true; + result_.key = iter_->key(); return true; } @@ -213,6 +220,6 @@ using IteratorWrapper = IteratorWrapperBase; class Arena; // Return an empty iterator (yields nothing) allocated from arena. template -extern InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); +InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace ROCKSDB_NAMESPACE diff --git a/table/merger_test.cc b/table/merger_test.cc index 71dc798e57b..29e433c28ec 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -107,7 +107,7 @@ class MergerTest : public testing::Test { } merging_iterator_.reset( - NewMergingIterator(&icomp_, &small_iterators[0], + NewMergingIterator(&icomp_, small_iterators.data(), static_cast(small_iterators.size()))); single_iterator_.reset(new VectorIterator(all_keys_, all_keys_, &icomp_)); } diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 247564fe7b0..833c6123eee 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -430,6 +430,11 @@ class MergingIterator : public InternalIterator { return current_->key(); } + uint64_t write_unix_time() const override { + assert(Valid()); + return current_->write_unix_time(); + } + Slice value() const override { assert(Valid()); return current_->value(); diff --git a/table/merging_iterator.h b/table/merging_iterator.h index 562a4e57f50..66351bcc39a 100644 --- a/table/merging_iterator.h +++ b/table/merging_iterator.h @@ -32,9 +32,10 @@ using InternalIterator = InternalIteratorBase; // key is present in K child iterators, it will be yielded K times. // // REQUIRES: n >= 0 -extern InternalIterator* NewMergingIterator( - const InternalKeyComparator* comparator, InternalIterator** children, int n, - Arena* arena = nullptr, bool prefix_seek_mode = false); +InternalIterator* NewMergingIterator(const InternalKeyComparator* comparator, + InternalIterator** children, int n, + Arena* arena = nullptr, + bool prefix_seek_mode = false); // The iterator returned by NewMergingIterator() and // MergeIteratorBuilder::Finish(). MergingIterator handles the merging of data diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 2cbaacec08f..55f5935b11c 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -186,7 +186,7 @@ void LogPropertiesCollectionError(Logger* info_log, const std::string& method, bool NotifyCollectTableCollectorsOnAdd( const Slice& key, const Slice& value, uint64_t file_size, - const std::vector>& collectors, + const std::vector>& collectors, Logger* info_log) { bool all_succeeded = true; for (auto& collector : collectors) { @@ -201,7 +201,7 @@ bool NotifyCollectTableCollectorsOnAdd( } void NotifyCollectTableCollectorsOnBlockAdd( - const std::vector>& collectors, + const std::vector>& collectors, const uint64_t block_uncomp_bytes, const uint64_t block_compressed_bytes_fast, const uint64_t block_compressed_bytes_slow) { @@ -212,7 +212,7 @@ void NotifyCollectTableCollectorsOnBlockAdd( } bool NotifyCollectTableCollectorsOnFinish( - const std::vector>& collectors, + const std::vector>& collectors, Logger* info_log, PropertyBlockBuilder* builder, UserCollectedProperties& user_collected_properties, UserCollectedProperties& readable_properties) { diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 0a404dc9cf5..3d1edb5018b 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -88,11 +88,11 @@ void LogPropertiesCollectionError(Logger* info_log, const std::string& method, // property collectors. bool NotifyCollectTableCollectorsOnAdd( const Slice& key, const Slice& value, uint64_t file_size, - const std::vector>& collectors, + const std::vector>& collectors, Logger* info_log); void NotifyCollectTableCollectorsOnBlockAdd( - const std::vector>& collectors, + const std::vector>& collectors, uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast, uint64_t block_compressed_bytes_slow); @@ -101,7 +101,7 @@ void NotifyCollectTableCollectorsOnBlockAdd( // It will also populate `user_collected_properties` and `readable_properties` // with the collected properties. bool NotifyCollectTableCollectorsOnFinish( - const std::vector>& collectors, + const std::vector>& collectors, Logger* info_log, PropertyBlockBuilder* builder, UserCollectedProperties& user_collected_properties, UserCollectedProperties& readable_properties); diff --git a/table/mock_table.cc b/table/mock_table.cc index 1823758e446..14fbb3f1d07 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -13,8 +13,7 @@ #include "table/get_context.h" #include "util/coding.h" -namespace ROCKSDB_NAMESPACE { -namespace mock { +namespace ROCKSDB_NAMESPACE::mock { KVVector MakeMockFile(std::initializer_list l) { return KVVector(l); } @@ -59,7 +58,7 @@ class MockTableReader : public TableReader { std::shared_ptr GetTableProperties() const override; - ~MockTableReader() {} + ~MockTableReader() = default; private: const KVVector& table_; @@ -134,7 +133,7 @@ class MockTableBuilder : public TableBuilder { } // REQUIRES: Either Finish() or Abandon() has been called. - ~MockTableBuilder() {} + ~MockTableBuilder() = default; // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. @@ -221,7 +220,13 @@ Status MockTableReader::Get(const ReadOptions&, const Slice& key, } bool dont_care __attribute__((__unused__)); - if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) { + Status read_status; + bool ret = get_context->SaveValue(parsed_key, iter->value(), &dont_care, + &read_status); + if (!read_status.ok()) { + return read_status; + } + if (!ret) { break; } } @@ -298,7 +303,7 @@ Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, *next_id = next_id_.fetch_add(1); char buf[4]; EncodeFixed32(buf, *next_id); - return file->Append(Slice(buf, 4)); + return file->Append(IOOptions(), Slice(buf, 4)); } Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, @@ -347,5 +352,4 @@ void MockTableFactory::AssertLatestFiles( } } -} // namespace mock -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::mock diff --git a/table/mock_table.h b/table/mock_table.h index e4850d06062..737360c2383 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -67,9 +67,7 @@ class MockTableFactory : public TableFactory { Status CreateMockTable(Env* env, const std::string& fname, KVVector file_contents); - virtual std::string GetPrintableOptions() const override { - return std::string(); - } + std::string GetPrintableOptions() const override { return std::string(); } void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index eece8100e6c..0d61c1031d9 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -40,7 +40,6 @@ void PersistentCacheHelper::InsertUncompressed( cache_options.persistent_cache ->Insert(key.AsSlice(), contents.data.data(), contents.data.size()) .PermitUncheckedError(); - ; } Status PersistentCacheHelper::LookupSerialized( diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index 24dd0f97ae4..f0443bd9450 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -5,8 +5,7 @@ #include "table/plain/plain_table_builder.h" -#include - +#include #include #include #include @@ -40,7 +39,7 @@ IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, uint64_t* offset, BlockHandle* block_handle) { block_handle->set_offset(*offset); block_handle->set_size(block_contents.size()); - IOStatus io_s = file->Append(block_contents); + IOStatus io_s = file->Append(IOOptions(), block_contents); if (io_s.ok()) { *offset += block_contents.size(); @@ -53,12 +52,12 @@ IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, // kPlainTableMagicNumber was picked by running // echo rocksdb.table.plain | sha1sum // and taking the leading 64 bits. -extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; -extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; +const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; +const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( const ImmutableOptions& ioptions, const MutableCFOptions& moptions, - const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + const InternalTblPropCollFactories* internal_tbl_prop_coll_factories, uint32_t column_family_id, int level_at_creation, WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, const std::string& column_family_name, @@ -115,13 +114,16 @@ PlainTableBuilder::PlainTableBuilder( properties_ .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val; - assert(int_tbl_prop_collector_factories); - for (auto& factory : *int_tbl_prop_collector_factories) { + assert(internal_tbl_prop_coll_factories); + for (auto& factory : *internal_tbl_prop_coll_factories) { assert(factory); - table_properties_collectors_.emplace_back( - factory->CreateIntTblPropCollector(column_family_id, - level_at_creation)); + std::unique_ptr collector{ + factory->CreateInternalTblPropColl(column_family_id, + level_at_creation)}; + if (collector) { + table_properties_collectors_.emplace_back(std::move(collector)); + } } } @@ -136,6 +138,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // temp buffer for metadata bytes between key and value. char meta_bytes_buf[6]; size_t meta_bytes_buf_size = 0; + const IOOptions opts; ParsedInternalKey internal_key; if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) @@ -176,12 +179,13 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); meta_bytes_buf_size = end_ptr - meta_bytes_buf; - io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + io_status_ = + file_->Append(opts, Slice(meta_bytes_buf, meta_bytes_buf_size)); } // Write value if (io_status_.ok()) { - io_status_ = file_->Append(value); + io_status_ = file_->Append(opts, value); offset_ += value_size + meta_bytes_buf_size; } @@ -304,7 +308,7 @@ Status PlainTableBuilder::Finish() { status_ = s; return status_; } - io_status_ = file_->Append(footer.GetSlice()); + io_status_ = file_->Append(IOOptions(), footer.GetSlice()); if (io_status_.ok()) { offset_ += footer.GetSlice().size(); } @@ -335,10 +339,10 @@ const char* PlainTableBuilder::GetFileChecksumFuncName() const { return kUnknownFileChecksumFuncName; } } -void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& string, - uint64_t uint_64) { +void PlainTableBuilder::SetSeqnoTimeTableProperties( + const SeqnoToTimeMapping& relevant_mapping, uint64_t uint_64) { // TODO: storing seqno to time mapping is not yet support for plain table. - TableBuilder::SetSeqnoTimeTableProperties(string, uint_64); + TableBuilder::SetSeqnoTimeTableProperties(relevant_mapping, uint_64); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h index fb7ea63be50..30e9558628a 100644 --- a/table/plain/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -39,7 +39,7 @@ class PlainTableBuilder : public TableBuilder { // that the caller does not know which level the output file will reside. PlainTableBuilder( const ImmutableOptions& ioptions, const MutableCFOptions& moptions, - const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + const InternalTblPropCollFactories* internal_tbl_prop_coll_factories, uint32_t column_family_id, int level_at_creation, WritableFileWriter* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, @@ -95,14 +95,14 @@ class PlainTableBuilder : public TableBuilder { // Get file checksum function name const char* GetFileChecksumFuncName() const override; - void SetSeqnoTimeTableProperties(const std::string& string, + void SetSeqnoTimeTableProperties(const SeqnoToTimeMapping& relevant_mapping, uint64_t uint_64) override; private: Arena arena_; const ImmutableOptions& ioptions_; const MutableCFOptions& moptions_; - std::vector> + std::vector> table_properties_collectors_; BloomBlockBuilder bloom_block_; diff --git a/table/plain/plain_table_factory.cc b/table/plain/plain_table_factory.cc index 80aa9cb8e8a..7d01b07f303 100644 --- a/table/plain/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -5,8 +5,7 @@ #include "table/plain/plain_table_factory.h" -#include - +#include #include #include "db/dbformat.h" @@ -78,7 +77,7 @@ TableBuilder* PlainTableFactory::NewTableBuilder( // return new PlainTableBuilder( table_builder_options.ioptions, table_builder_options.moptions, - table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.internal_tbl_prop_coll_factories, table_builder_options.column_family_id, table_builder_options.level_at_creation, file, table_options_.user_key_len, table_options_.encoding_type, @@ -157,7 +156,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()), [](const std::string& uri, std::unique_ptr* guard, std::string* /*errmsg*/) { - auto colon = uri.find(":"); + auto colon = uri.find(':'); if (colon != std::string::npos) { size_t count = ParseSizeT(uri.substr(colon + 1)); guard->reset(new VectorRepFactory(count)); @@ -170,7 +169,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()), [](const std::string& uri, std::unique_ptr* guard, std::string* /*errmsg*/) { - auto colon = uri.find(":"); + auto colon = uri.find(':'); if (colon != std::string::npos) { size_t lookahead = ParseSizeT(uri.substr(colon + 1)); guard->reset(new SkipListFactory(lookahead)); @@ -184,7 +183,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, [](const std::string& uri, std::unique_ptr* guard, std::string* /*errmsg*/) { // Expecting format: hash_linkedlist: - auto colon = uri.find(":"); + auto colon = uri.find(':'); if (colon != std::string::npos) { size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); guard->reset(NewHashLinkListRepFactory(hash_bucket_count)); @@ -198,7 +197,7 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, [](const std::string& uri, std::unique_ptr* guard, std::string* /*errmsg*/) { // Expecting format: prefix_hash: - auto colon = uri.find(":"); + auto colon = uri.find(':'); if (colon != std::string::npos) { size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); guard->reset(NewHashSkipListRepFactory(hash_bucket_count)); @@ -280,7 +279,7 @@ Status GetPlainTableOptionsFromMap( return s; } -extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { +TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { return new PlainTableFactory(options); } diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 0ac42319103..102a16a6b3b 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -94,6 +94,8 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, Slice key_to_write = key; // Portion of internal key to write out. uint32_t user_key_size = static_cast(key.size() - 8); + const IOOptions opts; + if (encoding_type_ == kPlain) { if (fixed_user_key_len_ == kPlainTableVariableLength) { // Write key length @@ -101,7 +103,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, char* ptr = EncodeVarint32(key_size_buf, user_key_size); assert(ptr <= key_size_buf + sizeof(key_size_buf)); auto len = ptr - key_size_buf; - IOStatus io_s = file->Append(Slice(key_size_buf, len)); + IOStatus io_s = file->Append(opts, Slice(key_size_buf, len)); if (!io_s.ok()) { return io_s; } @@ -119,7 +121,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, key_count_for_prefix_ = 1; pre_prefix_.SetUserKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); - IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + IOStatus io_s = file->Append(opts, Slice(size_bytes, size_bytes_pos)); if (!io_s.ok()) { return io_s; } @@ -137,7 +139,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, static_cast(pre_prefix_.GetUserKey().size()); size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, size_bytes + size_bytes_pos); - IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + IOStatus io_s = file->Append(opts, Slice(size_bytes, size_bytes_pos)); if (!io_s.ok()) { return io_s; } @@ -152,7 +154,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, // in this buffer to safe one file append call, which takes 1 byte. if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { IOStatus io_s = - file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); + file->Append(opts, Slice(key_to_write.data(), key_to_write.size() - 8)); if (!io_s.ok()) { return io_s; } @@ -160,7 +162,7 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; *meta_bytes_buf_size += 1; } else { - IOStatus io_s = file->Append(key_to_write); + IOStatus io_s = file->Append(opts, key_to_write); if (!io_s.ok()) { return io_s; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index a74da1f8952..d3c968f73a9 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -87,7 +87,6 @@ class PlainTableIterator : public InternalIterator { Status status_; }; -extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader( const ImmutableOptions& ioptions, std::unique_ptr&& file, @@ -126,7 +125,7 @@ Status PlainTableReader::Open( } std::unique_ptr props; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, ioptions, read_options, &props); @@ -300,7 +299,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents index_block_contents; - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, @@ -454,7 +453,9 @@ Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, ParsedInternalKey parsed_target; Status s = ParseInternalKey(target, &parsed_target, false /* log_err_key */); // TODO - if (!s.ok()) return s; + if (!s.ok()) { + return s; + } // The key is between [low, high). Do a binary search between it. while (high - low > 1) { @@ -591,7 +592,9 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, ParsedInternalKey parsed_target; s = ParseInternalKey(target, &parsed_target, false /* log_err_key */); // TODO - if (!s.ok()) return s; + if (!s.ok()) { + return s; + } Slice found_value; while (offset < file_info_.data_end_offset) { @@ -611,8 +614,12 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, // can we enable the fast path? if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { bool dont_care __attribute__((__unused__)); - if (!get_context->SaveValue(found_key, found_value, &dont_care, - dummy_cleanable_.get())) { + bool ret = get_context->SaveValue(found_key, found_value, &dont_care, &s, + dummy_cleanable_.get()); + if (!s.ok()) { + return s; + } + if (!ret) { break; } } @@ -642,7 +649,7 @@ PlainTableIterator::PlainTableIterator(PlainTableReader* table, next_offset_ = offset_ = table_->file_info_.data_end_offset; } -PlainTableIterator::~PlainTableIterator() {} +PlainTableIterator::~PlainTableIterator() = default; bool PlainTableIterator::Valid() const { return offset_ < table_->file_info_.data_end_offset && diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 0f5f7f3ce0e..b127e22c0f4 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -106,7 +106,7 @@ class PlainTableReader : public TableReader { return table_properties_; } - virtual size_t ApproximateMemoryUsage() const override { + size_t ApproximateMemoryUsage() const override { return arena_.MemoryAllocatedBytes(); } diff --git a/table/scoped_arena_iterator.h b/table/scoped_arena_iterator.h deleted file mode 100644 index 2b8824d95e4..00000000000 --- a/table/scoped_arena_iterator.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -#pragma once - -#include "port/port.h" -#include "table/internal_iterator.h" - -namespace ROCKSDB_NAMESPACE { -class ScopedArenaIterator { - void reset(InternalIterator* iter) noexcept { - if (iter_ != nullptr) { - iter_->~InternalIterator(); - } - iter_ = iter; - } - - public: - explicit ScopedArenaIterator(InternalIterator* iter = nullptr) - : iter_(iter) {} - - ScopedArenaIterator(const ScopedArenaIterator&) = delete; - ScopedArenaIterator& operator=(const ScopedArenaIterator&) = delete; - - ScopedArenaIterator(ScopedArenaIterator&& o) noexcept { - iter_ = o.iter_; - o.iter_ = nullptr; - } - - ScopedArenaIterator& operator=(ScopedArenaIterator&& o) noexcept { - reset(o.iter_); - o.iter_ = nullptr; - return *this; - } - - InternalIterator* operator->() { return iter_; } - InternalIterator* get() { return iter_; } - - void set(InternalIterator* iter) { reset(iter); } - - InternalIterator* release() { - assert(iter_ != nullptr); - auto* res = iter_; - iter_ = nullptr; - return res; - } - - ~ScopedArenaIterator() { reset(nullptr); } - - private: - InternalIterator* iter_; -}; -} // namespace ROCKSDB_NAMESPACE diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index 150776de1b2..d201163808d 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -58,6 +58,7 @@ SstFileDumper::SstFileDumper(const Options& options, options_(options), ioptions_(options_), moptions_(ColumnFamilyOptions(options_)), + // TODO: plumb Env::IOActivity, Env::IOPriority read_options_(verify_checksum, false), internal_comparator_(BytewiseComparator()) { read_options_.readahead_size = readahead_size; @@ -67,11 +68,6 @@ SstFileDumper::SstFileDumper(const Options& options, init_result_ = GetTableReader(file_name_); } -extern const uint64_t kBlockBasedTableMagicNumber; -extern const uint64_t kLegacyBlockBasedTableMagicNumber; -extern const uint64_t kPlainTableMagicNumber; -extern const uint64_t kLegacyPlainTableMagicNumber; - const char* testFileName = "test_file_name"; Status SstFileDumper::GetTableReader(const std::string& file_path) { @@ -90,20 +86,21 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { fopts.temperature = file_temp_; Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); if (s.ok()) { + // check empty file + // if true, skip further processing of this file s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); - } - - // check empty file - // if true, skip further processing of this file - if (file_size == 0) { - return Status::Aborted(file_path, "Empty file"); + if (s.ok()) { + if (file_size == 0) { + return Status::Aborted(file_path, "Empty file"); + } + } } file_.reset(new RandomAccessFileReader(std::move(file), file_path)); - FilePrefetchBuffer prefetch_buffer( - 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, - false /* track_min_offset */); + FilePrefetchBuffer prefetch_buffer(ReadaheadParams(), + !fopts.use_mmap_reads /* enable */, + false /* track_min_offset */); if (s.ok()) { const uint64_t kSstDumpTailPrefetchSize = 512 * 1024; uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize) @@ -123,8 +120,15 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { if (s.ok()) { if (magic_number == kPlainTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber) { + magic_number == kLegacyPlainTableMagicNumber || + magic_number == kCuckooTableMagicNumber) { soptions_.use_mmap_reads = true; + fopts.use_mmap_reads = soptions_.use_mmap_reads; + + if (magic_number == kCuckooTableMagicNumber) { + fopts = soptions_; + fopts.temperature = file_temp_; + } fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); file_.reset(new RandomAccessFileReader(std::move(file), file_path)); @@ -167,8 +171,6 @@ Status SstFileDumper::NewTableReader( const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, std::unique_ptr* /*table_reader*/) { - // TODO(yuzhangyu): full support in sst_dump for SST files generated when - // `user_defined_timestamps_persisted` is false. auto t_opt = TableReaderOptions( ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_, 0 /* block_protection_bytes_per_key */, false /* skip_filters */, @@ -296,14 +298,18 @@ Status SstFileDumper::ShowCompressionSize( const ImmutableOptions imoptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); + // TODO: plumb Env::IOActivity, Env::IOPriority + const ReadOptions read_options; + const WriteOptions write_options; ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); - IntTblPropCollectorFactories block_based_table_factories; + InternalTblPropCollFactories block_based_table_factories; std::string column_family_name; int unknown_level = -1; + TableBuilderOptions tb_opts( - imoptions, moptions, ikc, &block_based_table_factories, compress_type, - compress_opt, + imoptions, moptions, read_options, write_options, ikc, + &block_based_table_factories, compress_type, compress_opt, TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level); uint64_t num_data_blocks = 0; @@ -368,10 +374,8 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, RandomAccessFileReader* file, uint64_t file_size, FilePrefetchBuffer* prefetch_buffer) { - // TODO: plumb Env::IOActivity - const ReadOptions read_options; Status s = ROCKSDB_NAMESPACE::ReadTableProperties( - file, file_size, table_magic_number, ioptions_, read_options, + file, file_size, table_magic_number, ioptions_, read_options_, &table_properties_, /* memory_allocator= */ nullptr, prefetch_buffer); if (!s.ok()) { @@ -426,6 +430,13 @@ Status SstFileDumper::SetTableOptionsByMagicNumber( if (!silent_) { fprintf(stdout, "Sst file format: plain table\n"); } + } else if (table_magic_number == kCuckooTableMagicNumber) { + ioptions_.allow_mmap_reads = true; + + options_.table_factory.reset(NewCuckooTableFactory()); + if (!silent_) { + fprintf(stdout, "Sst file format: cuckoo table\n"); + } } else { char error_msg_buffer[80]; snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, @@ -447,7 +458,7 @@ Status SstFileDumper::SetOldTableOptions() { return Status::OK(); } -Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, +Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit, bool has_from, const std::string& from_key, bool has_to, const std::string& to_key, bool use_from_as_prefix) { @@ -481,7 +492,9 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, Slice key = iter->key(); Slice value = iter->value(); ++i; - if (read_num > 0 && i > read_num) break; + if (read_num_limit > 0 && i > read_num_limit) { + break; + } ParsedInternalKey ikey; Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */); @@ -514,6 +527,13 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, fprintf(stdout, "%s => %s\n", ikey.DebugString(true, output_hex_).c_str(), oss.str().c_str()); + } else if (ikey.type == kTypeValuePreferredSeqno) { + auto [unpacked_value, preferred_seqno] = + ParsePackedValueWithSeqno(value); + fprintf(stdout, "%s => %s, %llu\n", + ikey.DebugString(true, output_hex_).c_str(), + unpacked_value.ToString(output_hex_).c_str(), + static_cast(preferred_seqno)); } else { fprintf(stdout, "%s => %s\n", ikey.DebugString(true, output_hex_).c_str(), @@ -539,6 +559,31 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, read_num_ += i; Status ret = iter->status(); + + bool verify_num_entries = + (read_num_limit == 0 || + read_num_limit == std::numeric_limits::max()) && + !has_from && !has_to; + if (verify_num_entries && ret.ok()) { + // Compare the number of entries + if (!table_properties_) { + fprintf(stderr, "Table properties not available."); + } else { + // TODO: verify num_range_deletions + if (i != table_properties_->num_entries - + table_properties_->num_range_deletions) { + std::ostringstream oss; + oss << "Table property expects " + << table_properties_->num_entries - + table_properties_->num_range_deletions + << " entries when excluding range deletions," + << " but scanning the table returned " << std::to_string(i) + << " entries"; + ret = Status::Corruption(oss.str()); + } + } + } + delete iter; return ret; } diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h index 1e78959d145..893d98bb3de 100644 --- a/table/sst_file_dumper.h +++ b/table/sst_file_dumper.h @@ -23,7 +23,12 @@ class SstFileDumper { const EnvOptions& soptions = EnvOptions(), bool silent = false); - Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, + // read_num_limit limits the total number of keys read. If read_num_limit = 0, + // then there is no limit. If read_num_limit = 0 or + // std::numeric_limits::max(), has_from and has_to are false, then + // the number of keys read is compared with `num_entries` field in table + // properties. A Corruption status is returned if they do not match. + Status ReadSequential(bool print_kv, uint64_t read_num_limit, bool has_from, const std::string& from_key, bool has_to, const std::string& to_key, bool use_from_as_prefix = false); @@ -88,7 +93,7 @@ class SstFileDumper { std::unique_ptr table_reader_; std::unique_ptr file_; - const ImmutableOptions ioptions_; + ImmutableOptions ioptions_; const MutableCFOptions moptions_; ReadOptions read_options_; InternalKeyComparator internal_comparator_; diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 533b7cd6ac7..da50ff037c8 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -36,7 +36,7 @@ struct SstFileReader::Rep { SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {} -SstFileReader::~SstFileReader() {} +SstFileReader::~SstFileReader() = default; Status SstFileReader::Open(const std::string& file_path) { auto r = rep_.get(); @@ -55,9 +55,17 @@ Status SstFileReader::Open(const std::string& file_path) { file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); } if (s.ok()) { - TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor, - r->soptions, r->ioptions.internal_comparator, - r->moptions.block_protection_bytes_per_key); + TableReaderOptions t_opt( + r->ioptions, r->moptions.prefix_extractor, r->soptions, + r->ioptions.internal_comparator, + r->moptions.block_protection_bytes_per_key, + /*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, /*cur_db_session_id*/ "", + /*cur_file_num*/ 0, + /* unique_id */ {}, /* largest_seqno */ 0, + /* tail_size */ 0, r->ioptions.persist_user_defined_timestamps); // Allow open file with global sequence number for backward compatibility. t_opt.largest_seqno = kMaxSequenceNumber; s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader), @@ -73,12 +81,11 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { ? roptions.snapshot->GetSequenceNumber() : kMaxSequenceNumber; ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); - res->Init(r->options.env, roptions, r->ioptions, r->moptions, - nullptr /* version */, sequence, - r->moptions.max_sequential_skip_in_iterations, - 0 /* version_number */, nullptr /* read_callback */, - nullptr /* db_impl */, nullptr /* cfd */, - true /* expose_blob_index */, false /* allow_refresh */); + res->Init( + r->options.env, roptions, r->ioptions, r->moptions, nullptr /* version */, + sequence, r->moptions.max_sequential_skip_in_iterations, + 0 /* version_number */, nullptr /* read_callback */, nullptr /* cfh */, + true /* expose_blob_index */, false /* allow_refresh */); auto internal_iter = r->table_reader->NewIterator( res->GetReadOptions(), r->moptions.prefix_extractor.get(), res->GetArena(), false /* skip_filters */, @@ -98,4 +105,40 @@ Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { TableReaderCaller::kSSTFileReader); } +Status SstFileReader::VerifyNumEntries(const ReadOptions& read_options) { + Rep* r = rep_.get(); + std::unique_ptr internal_iter{r->table_reader->NewIterator( + read_options, r->moptions.prefix_extractor.get(), nullptr, + false /* skip_filters */, TableReaderCaller::kSSTFileReader)}; + internal_iter->SeekToFirst(); + Status s = internal_iter->status(); + if (!s.ok()) { + return s; + } + uint64_t num_read = 0; + for (; internal_iter->Valid(); internal_iter->Next()) { + ++num_read; + }; + s = internal_iter->status(); + if (!s.ok()) { + return s; + } + std::shared_ptr tp = GetTableProperties(); + if (!tp) { + s = Status::Corruption("table properties not available"); + } else { + // TODO: verify num_range_deletions + uint64_t expected = tp->num_entries - tp->num_range_deletions; + if (num_read != expected) { + std::ostringstream oss; + oss << "Table property expects " << expected + << " entries when excluding range deletions," + << " but scanning the table returned " << std::to_string(num_read) + << " entries"; + s = Status::Corruption(oss.str()); + } + } + return s; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 36a7975cfcd..597909925a9 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -248,7 +248,8 @@ class SstFileReaderTimestampTest : public testing::Test { : KeyValueDesc(std::move(k), std::string(ts), std::string(v)) {} }; - void CreateFile(const std::vector& descs) { + void CreateFile(const std::vector& descs, + ExternalSstFileInfo* file_info = nullptr) { SstFileWriter writer(soptions_, options_); ASSERT_OK(writer.Open(sst_name_)); @@ -276,7 +277,7 @@ class SstFileReaderTimestampTest : public testing::Test { } } - ASSERT_OK(writer.Finish()); + ASSERT_OK(writer.Finish(file_info)); } void CheckFile(const std::string& timestamp, @@ -413,6 +414,170 @@ TEST_F(SstFileReaderTimestampTest, TimestampSizeMismatch) { "end_key_with_a_complete_lack_of_timestamps")); } +class SstFileReaderTimestampNotPersistedTest + : public SstFileReaderTimestampTest { + public: + SstFileReaderTimestampNotPersistedTest() { + Env* env = Env::Default(); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_)); + EXPECT_NE(nullptr, env); + + options_.env = env; + + options_.comparator = test::BytewiseComparatorWithU64TsWrapper(); + + options_.persist_user_defined_timestamps = false; + + sst_name_ = test::PerThreadDBPath("sst_file_ts_not_persisted"); + } + + ~SstFileReaderTimestampNotPersistedTest() = default; +}; + +TEST_F(SstFileReaderTimestampNotPersistedTest, Basic) { + std::vector input_descs; + + for (uint64_t k = 0; k < kNumKeys; k++) { + input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(0), + /* value */ EncodeAsString(k), /* is_delete */ false, + /* use_contiguous_buffer */ (k % 2) == 0); + } + + ExternalSstFileInfo external_sst_file_info; + + CreateFile(input_descs, &external_sst_file_info); + std::vector output_descs; + + for (uint64_t k = 0; k < kNumKeys; k++) { + output_descs.emplace_back(/* key */ EncodeAsString(k), + /* timestamp */ EncodeAsUint64(0), + /* value */ EncodeAsString(k)); + } + CheckFile(EncodeAsUint64(0), output_descs); + ASSERT_EQ(external_sst_file_info.smallest_key, EncodeAsString(0)); + ASSERT_EQ(external_sst_file_info.largest_key, EncodeAsString(kNumKeys - 1)); + ASSERT_EQ(external_sst_file_info.smallest_range_del_key, ""); + ASSERT_EQ(external_sst_file_info.largest_range_del_key, ""); +} + +TEST_F(SstFileReaderTimestampNotPersistedTest, NonMinTimestampNotAllowed) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + ASSERT_NOK(writer.Delete("baz", EncodeAsUint64(2))); + ASSERT_OK(writer.Put("baz", EncodeAsUint64(0), "foo_val")); + + ASSERT_NOK(writer.Put("key", EncodeAsUint64(2), "value1")); + ASSERT_OK(writer.Put("key", EncodeAsUint64(0), "value2")); + + // The `SstFileWriter::DeleteRange` API documentation specifies that + // a range deletion tombstone added in the file does NOT delete point + // (Put/Merge/Delete) keys in the same file. While there is no checks in + // `SstFileWriter` to ensure this requirement is met, when such a range + // deletion does exist, it will get over-written by point data in the same + // file after ingestion because they have the same sequence number. + // We allow having a point data entry and having a range deletion entry for + // a key in the same file when timestamps are removed for the same reason. + // After the file is ingested, the range deletion will effectively get + // over-written by the point data since they will have the same sequence + // number and the same user-defined timestamps. + ASSERT_NOK(writer.DeleteRange("bar", "foo", EncodeAsUint64(2))); + ASSERT_OK(writer.DeleteRange("bar", "foo", EncodeAsUint64(0))); + + ExternalSstFileInfo external_sst_file_info; + + ASSERT_OK(writer.Finish(&external_sst_file_info)); + ASSERT_EQ(external_sst_file_info.smallest_key, "baz"); + ASSERT_EQ(external_sst_file_info.largest_key, "key"); + ASSERT_EQ(external_sst_file_info.smallest_range_del_key, "bar"); + ASSERT_EQ(external_sst_file_info.largest_range_del_key, "foo"); +} + +TEST_F(SstFileReaderTimestampNotPersistedTest, KeyWithoutTimestampOutOfOrder) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + ASSERT_OK(writer.Put("foo", EncodeAsUint64(0), "value1")); + ASSERT_NOK(writer.Put("bar", EncodeAsUint64(0), "value2")); +} + +TEST_F(SstFileReaderTimestampNotPersistedTest, IncompatibleTimestampFormat) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Even though in this mode timestamps are not persisted, we require users + // to call the timestamp-aware APIs only. + ASSERT_TRUE(writer.Put("key", "not_an_actual_64_bit_timestamp", "value") + .IsInvalidArgument()); + ASSERT_TRUE(writer.Delete("another_key", "timestamp_of_unexpected_size") + .IsInvalidArgument()); + + ASSERT_TRUE(writer.Put("key_without_timestamp", "value").IsInvalidArgument()); + ASSERT_TRUE(writer.Merge("another_key_missing_a_timestamp", "merge_operand") + .IsInvalidArgument()); + ASSERT_TRUE( + writer.Delete("yet_another_key_still_no_timestamp").IsInvalidArgument()); + ASSERT_TRUE(writer + .DeleteRange("begin_key_timestamp_absent", + "end_key_with_a_complete_lack_of_timestamps") + .IsInvalidArgument()); +} + +TEST_F(SstFileReaderTest, VerifyNumEntriesBasic) { + std::vector keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFile(sst_name_, keys); + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + ASSERT_OK(reader.VerifyNumEntries(ReadOptions())); +} + +TEST_F(SstFileReaderTest, VerifyNumEntriesDeleteRange) { + SstFileWriter writer(soptions_, options_); + ASSERT_OK(writer.Open(sst_name_)); + + for (uint64_t i = 0; i < kNumKeys; i++) { + ASSERT_OK(writer.Put(EncodeAsUint64(i), EncodeAsUint64(i + 1))); + } + ASSERT_OK( + writer.DeleteRange(EncodeAsUint64(0), EncodeAsUint64(kNumKeys / 2))); + ASSERT_OK(writer.Finish()); + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + ASSERT_OK(reader.VerifyNumEntries(ReadOptions())); +} + +TEST_F(SstFileReaderTest, VerifyNumEntriesCorruption) { + const int num_keys = 99; + const int corrupted_num_keys = num_keys + 2; + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + props->num_entries = corrupted_num_keys; + }); + SyncPoint::GetInstance()->EnableProcessing(); + std::vector keys; + for (uint64_t i = 0; i < num_keys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFile(sst_name_, keys); + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + Status s = reader.VerifyNumEntries(ReadOptions()); + ASSERT_TRUE(s.IsCorruption()); + std::ostringstream oss; + oss << "Table property expects " << corrupted_num_keys + << " entries when excluding range deletions," + << " but scanning the table returned " << num_keys << " entries"; + ASSERT_TRUE(std::strstr(oss.str().c_str(), s.getState())); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 3364e1e016e..5627ff58c5b 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -43,7 +43,14 @@ struct SstFileWriter::Rep { skip_filters(_skip_filters), unsafe_add(_unsafe_add), unsafe_disable_sync(_unsafe_disable_sync), - db_session_id(std::move(_db_session_id)) {} + db_session_id(std::move(_db_session_id)), + ts_sz(_user_comparator->timestamp_size()), + strip_timestamp(ts_sz > 0 && + !ioptions.persist_user_defined_timestamps) { + // TODO (hx235): pass in `WriteOptions` instead of `rate_limiter_priority` + // during construction + write_options.rate_limiter_priority = io_priority; + } std::unique_ptr file_writer; std::unique_ptr builder; @@ -51,6 +58,7 @@ struct SstFileWriter::Rep { ImmutableOptions ioptions; MutableCFOptions mutable_cf_options; Env::IOPriority io_priority; + WriteOptions write_options; InternalKeyComparator internal_comparator; ExternalSstFileInfo file_info; InternalKey ikey; @@ -67,13 +75,29 @@ struct SstFileWriter::Rep { bool unsafe_disable_sync; std::string db_session_id; uint64_t next_file_number = 1; + size_t ts_sz; + bool strip_timestamp; Status AddImpl(const Slice& user_key, const Slice& value, ValueType value_type) { if (!builder) { return Status::InvalidArgument("File is not opened"); } + if (!builder->status().ok()) { + return builder->status(); + } + assert(user_key.size() >= ts_sz); + if (strip_timestamp) { + // In this mode, we expect users to always provide a min timestamp. + if (internal_comparator.user_comparator()->CompareTimestamp( + Slice(user_key.data() + user_key.size() - ts_sz, ts_sz), + MinU64Ts()) != 0) { + return Status::InvalidArgument( + "persist_user_defined_timestamps flag is set to false, only " + "minimum timestamp is accepted."); + } + } if (file_info.num_entries == 0) { file_info.smallest_key.assign(user_key.data(), user_key.size()); } else if (!unsafe_add) { @@ -102,7 +126,7 @@ struct SstFileWriter::Rep { file_info.file_size = builder->FileSize(); InvalidatePageCache(false /* closing */).PermitUncheckedError(); - return Status::OK(); + return builder->status(); } Status Add(const Slice& user_key, const Slice& value, ValueType value_type) { @@ -167,6 +191,28 @@ struct SstFileWriter::Rep { return Status::OK(); } + assert(begin_key.size() >= ts_sz); + assert(end_key.size() >= ts_sz); + Slice begin_key_ts = + Slice(begin_key.data() + begin_key.size() - ts_sz, ts_sz); + Slice end_key_ts = Slice(end_key.data() + end_key.size() - ts_sz, ts_sz); + assert(begin_key_ts.compare(end_key_ts) == 0); + if (strip_timestamp) { + // In this mode, we expect users to always provide a min timestamp. + if (internal_comparator.user_comparator()->CompareTimestamp( + begin_key_ts, MinU64Ts()) != 0) { + return Status::InvalidArgument( + "persist_user_defined_timestamps flag is set to false, only " + "minimum timestamp is accepted for start key."); + } + if (internal_comparator.user_comparator()->CompareTimestamp( + end_key_ts, MinU64Ts()) != 0) { + return Status::InvalidArgument( + "persist_user_defined_timestamps flag is set to false, only " + "minimum timestamp is accepted for end key."); + } + } + RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */); if (file_info.num_range_del_entries == 0) { file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), @@ -287,11 +333,12 @@ SstFileWriter::~SstFileWriter() { } } -Status SstFileWriter::Open(const std::string& file_path) { +Status SstFileWriter::Open(const std::string& file_path, Temperature temp) { Rep* r = rep_.get(); Status s; std::unique_ptr sst_file; FileOptions cur_file_opts(r->env_options); + cur_file_opts.temperature = temp; s = r->ioptions.env->GetFileSystem()->NewWritableFile( file_path, cur_file_opts, &sst_file, nullptr); if (!s.ok()) { @@ -319,10 +366,10 @@ Status SstFileWriter::Open(const std::string& file_path) { compression_opts = r->mutable_cf_options.compression_opts; } - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; // SstFileWriter properties collector to add SstFileWriter version. - int_tbl_prop_collector_factories.emplace_back( + internal_tbl_prop_coll_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); @@ -330,7 +377,7 @@ Status SstFileWriter::Open(const std::string& file_path) { auto user_collector_factories = r->ioptions.table_properties_collector_factories; for (size_t i = 0; i < user_collector_factories.size(); i++) { - int_tbl_prop_collector_factories.emplace_back( + internal_tbl_prop_coll_factories.emplace_back( new UserKeyTablePropertiesCollectorFactory( user_collector_factories[i])); } @@ -349,13 +396,15 @@ Status SstFileWriter::Open(const std::string& file_path) { // TODO: it would be better to set oldest_key_time to be used for getting the // approximate time of ingested keys. + // TODO: plumb Env::IOActivity, Env::IOPriority TableBuilderOptions table_builder_options( - r->ioptions, r->mutable_cf_options, r->internal_comparator, - &int_tbl_prop_collector_factories, compression_type, compression_opts, - cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, - TableFileCreationReason::kMisc, 0 /* oldest_key_time */, - 0 /* file_creation_time */, "SST Writer" /* db_id */, r->db_session_id, - 0 /* target_file_size */, r->next_file_number); + r->ioptions, r->mutable_cf_options, ReadOptions(), r->write_options, + r->internal_comparator, &internal_tbl_prop_coll_factories, + compression_type, compression_opts, cf_id, r->column_family_name, + unknown_level, false /* is_bottommost */, TableFileCreationReason::kMisc, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "SST Writer" /* db_id */, r->db_session_id, 0 /* target_file_size */, + r->next_file_number); // External SST files used to each get a unique session id. Now for // slightly better uniqueness probability in constructing cache keys, we // assign fake file numbers to each file (into table properties) and keep @@ -367,8 +416,8 @@ Status SstFileWriter::Open(const std::string& file_path) { FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; r->file_writer.reset(new WritableFileWriter( std::move(sst_file), file_path, r->env_options, r->ioptions.clock, - nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, - r->ioptions.file_checksum_gen_factory.get(), + nullptr /* io_tracer */, r->ioptions.stats, Histograms::SST_WRITE_MICROS, + r->ioptions.listeners, r->ioptions.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); // TODO(tec) : If table_factory is using compressed block cache, we will @@ -436,13 +485,17 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { Status s = r->builder->Finish(); r->file_info.file_size = r->builder->FileSize(); + IOOptions opts; + if (s.ok()) { + s = WritableFileWriter::PrepareIOOptions(r->write_options, opts); + } if (s.ok()) { if (!r->unsafe_disable_sync) { - s = r->file_writer->Sync(r->ioptions.use_fsync); + s = r->file_writer->Sync(opts, r->ioptions.use_fsync); } + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); if (s.ok()) { - r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); - s = r->file_writer->Close(); + s = r->file_writer->Close(opts); } } if (s.ok()) { @@ -456,6 +509,30 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { if (file_info != nullptr) { *file_info = r->file_info; + Slice smallest_key = r->file_info.smallest_key; + Slice largest_key = r->file_info.largest_key; + Slice smallest_range_del_key = r->file_info.smallest_range_del_key; + Slice largest_range_del_key = r->file_info.largest_range_del_key; + assert(smallest_key.empty() == largest_key.empty()); + assert(smallest_range_del_key.empty() == largest_range_del_key.empty()); + // Remove user-defined timestamps from external file metadata too when they + // should not be persisted. + if (r->strip_timestamp) { + if (!smallest_key.empty()) { + assert(smallest_key.size() >= r->ts_sz); + assert(largest_key.size() >= r->ts_sz); + file_info->smallest_key.resize(smallest_key.size() - r->ts_sz); + file_info->largest_key.resize(largest_key.size() - r->ts_sz); + } + if (!smallest_range_del_key.empty()) { + assert(smallest_range_del_key.size() >= r->ts_sz); + assert(largest_range_del_key.size() >= r->ts_sz); + file_info->smallest_range_del_key.resize(smallest_range_del_key.size() - + r->ts_sz); + file_info->largest_range_del_key.resize(largest_range_del_key.size() - + r->ts_sz); + } + } } r->builder.reset(); diff --git a/table/sst_file_writer_collectors.h b/table/sst_file_writer_collectors.h index 486315fb5a7..5f421dffb9c 100644 --- a/table/sst_file_writer_collectors.h +++ b/table/sst_file_writer_collectors.h @@ -23,28 +23,28 @@ struct ExternalSstFilePropertyNames { // PropertiesCollector used to add properties specific to tables // generated by SstFileWriter -class SstFileWriterPropertiesCollector : public IntTblPropCollector { +class SstFileWriterPropertiesCollector : public InternalTblPropColl { public: explicit SstFileWriterPropertiesCollector(int32_t version, SequenceNumber global_seqno) : version_(version), global_seqno_(global_seqno) {} - virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, - uint64_t /*file_size*/) override { + Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { // Intentionally left blank. Have no interest in collecting stats for // individual key/value pairs. return Status::OK(); } - virtual void BlockAdd(uint64_t /* block_uncomp_bytes */, - uint64_t /* block_compressed_bytes_fast */, - uint64_t /* block_compressed_bytes_slow */) override { + void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; } - virtual Status Finish(UserCollectedProperties* properties) override { + Status Finish(UserCollectedProperties* properties) override { // File version std::string version_val; PutFixed32(&version_val, static_cast(version_)); @@ -58,11 +58,11 @@ class SstFileWriterPropertiesCollector : public IntTblPropCollector { return Status::OK(); } - virtual const char* Name() const override { + const char* Name() const override { return "SstFileWriterPropertiesCollector"; } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return {{ExternalSstFilePropertyNames::kVersion, std::to_string(version_)}}; } @@ -72,18 +72,18 @@ class SstFileWriterPropertiesCollector : public IntTblPropCollector { }; class SstFileWriterPropertiesCollectorFactory - : public IntTblPropCollectorFactory { + : public InternalTblPropCollFactory { public: explicit SstFileWriterPropertiesCollectorFactory(int32_t version, SequenceNumber global_seqno) : version_(version), global_seqno_(global_seqno) {} - virtual IntTblPropCollector* CreateIntTblPropCollector( + InternalTblPropColl* CreateInternalTblPropColl( uint32_t /*column_family_id*/, int /* level_at_creation */) override { return new SstFileWriterPropertiesCollector(version_, global_seqno_); } - virtual const char* Name() const override { + const char* Name() const override { return "SstFileWriterPropertiesCollector"; } diff --git a/table/table_builder.h b/table/table_builder.h index d6f0e1a03c9..b2866c25b21 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -102,8 +102,9 @@ struct TableReaderOptions { struct TableBuilderOptions { TableBuilderOptions( const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, + const ReadOptions& _read_options, const WriteOptions& _write_options, const InternalKeyComparator& _internal_comparator, - const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + const InternalTblPropCollFactories* _internal_tbl_prop_coll_factories, CompressionType _compression_type, const CompressionOptions& _compression_opts, uint32_t _column_family_id, const std::string& _column_family_name, int _level, @@ -115,8 +116,10 @@ struct TableBuilderOptions { const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) : ioptions(_ioptions), moptions(_moptions), + read_options(_read_options), + write_options(_write_options), internal_comparator(_internal_comparator), - int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), + internal_tbl_prop_coll_factories(_internal_tbl_prop_coll_factories), compression_type(_compression_type), compression_opts(_compression_opts), column_family_id(_column_family_id), @@ -133,8 +136,10 @@ struct TableBuilderOptions { const ImmutableOptions& ioptions; const MutableCFOptions& moptions; + const ReadOptions& read_options; + const WriteOptions& write_options; const InternalKeyComparator& internal_comparator; - const IntTblPropCollectorFactories* int_tbl_prop_collector_factories; + const InternalTblPropCollFactories* internal_tbl_prop_coll_factories; const CompressionType compression_type; const CompressionOptions& compression_opts; const uint32_t column_family_id; @@ -223,10 +228,11 @@ class TableBuilder { // Return file checksum function name virtual const char* GetFileChecksumFuncName() const = 0; - // Set the sequence number to time mapping + // Set the sequence number to time mapping. `relevant_mapping` must be in + // enforced state (ready to encode to string). virtual void SetSeqnoTimeTableProperties( - const std::string& /*encoded_seqno_to_time_mapping*/, - uint64_t /*oldest_ancestor_time*/){}; + const SeqnoToTimeMapping& /*relevant_mapping*/, + uint64_t /*oldest_ancestor_time*/){} }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_properties.cc b/table/table_properties.cc index 17a13543de8..0a899af37a6 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -163,7 +163,7 @@ std::string TableProperties::ToString(const std::string& prop_delim, kv_delim); SeqnoToTimeMapping seq_time_mapping; - s = seq_time_mapping.Add(seqno_to_time_mapping); + s = seq_time_mapping.DecodeFrom(seqno_to_time_mapping); AppendProperty(result, "Sequence number to time mapping", s.ok() ? seq_time_mapping.ToHumanString() : "N/A", prop_delim, kv_delim); diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 60c84d7bf09..2e9094bfcb7 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -95,14 +95,16 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, FileOptions(env_options), &file_writer, nullptr)); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; int unknown_level = -1; + const WriteOptions write_options; tb = opts.table_factory->NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, CompressionOptions(), - 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + TableBuilderOptions(ioptions, moptions, read_options, write_options, + ikc, &internal_tbl_prop_coll_factories, + CompressionType::kNoCompression, + CompressionOptions(), 0 /* column_family_id */, + kDefaultColumnFamilyName, unknown_level), file_writer.get()); } else { s = DB::Open(opts, dbname, &db); @@ -122,7 +124,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } if (!through_db) { tb->Finish(); - file_writer->Close(); + file_writer->Close(IOOptions()); } else { db->Flush(FlushOptions()); } diff --git a/table/table_test.cc b/table/table_test.cc index e6f95243e1e..02a8d899d77 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -10,10 +10,10 @@ #include "rocksdb/table.h" #include -#include -#include #include +#include +#include #include #include #include @@ -52,6 +52,7 @@ #include "table/block_based/block.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_iterator.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include "table/block_based/filter_policy_internal.h" @@ -62,7 +63,6 @@ #include "table/internal_iterator.h" #include "table/meta_blocks.h" #include "table/plain/plain_table_factory.h" -#include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/unique_id_impl.h" #include "test_util/sync_point.h" @@ -78,11 +78,6 @@ namespace ROCKSDB_NAMESPACE { -extern const uint64_t kLegacyBlockBasedTableMagicNumber; -extern const uint64_t kLegacyPlainTableMagicNumber; -extern const uint64_t kBlockBasedTableMagicNumber; -extern const uint64_t kPlainTableMagicNumber; - namespace { const std::string kDummyValue(10000, 'o'); @@ -186,7 +181,7 @@ class Constructor { public: explicit Constructor(const Comparator* cmp) : data_(stl_wrappers::LessOfComparator(cmp)) {} - virtual ~Constructor() {} + virtual ~Constructor() = default; void Add(const std::string& key, const Slice& value) { data_[key] = value.ToString(); @@ -295,8 +290,8 @@ class KeyConvertingIterator : public InternalIterator { bool arena_mode_; // No copying allowed - KeyConvertingIterator(const KeyConvertingIterator&); - void operator=(const KeyConvertingIterator&); + KeyConvertingIterator(const KeyConvertingIterator&) = delete; + void operator=(const KeyConvertingIterator&) = delete; }; // `BlockConstructor` APIs always accept/return user keys. @@ -345,7 +340,7 @@ class BlockConstructor : public Constructor { std::string data_; Block* block_; - BlockConstructor(); + BlockConstructor() = delete; }; class TableConstructor : public Constructor { @@ -372,19 +367,22 @@ class TableConstructor : public Constructor { file_writer_.reset(new WritableFileWriter( std::move(sink), "" /* don't care */, FileOptions())); std::unique_ptr builder; - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; if (largest_seqno_ != 0) { // Pretend that it's an external file written by SstFileWriter. - int_tbl_prop_collector_factories.emplace_back( + internal_tbl_prop_coll_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); } std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + internal_comparator, + &internal_tbl_prop_coll_factories, options.compression, options.compression_opts, kUnknownColumnFamily, column_family_name, level_), file_writer_.get())); @@ -401,7 +399,7 @@ class TableConstructor : public Constructor { EXPECT_OK(builder->status()); } Status s = builder->Finish(); - EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(file_writer_->Flush(IOOptions())); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); @@ -469,6 +467,7 @@ class TableConstructor : public Constructor { } BlockCacheTracer block_cache_tracer_; + Env* env_; private: void Reset() { @@ -487,11 +486,10 @@ class TableConstructor : public Constructor { bool convert_to_internal_key_; int level_; - TableConstructor(); + TableConstructor() = delete; static uint64_t cur_file_num_; EnvOptions soptions; - Env* env_; }; uint64_t TableConstructor::cur_file_num_ = 1; @@ -535,7 +533,9 @@ class MemTableConstructor : public Constructor { InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { return new KeyConvertingIterator( - memtable_->NewIterator(ReadOptions(), &arena_), true); + memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr, + &arena_), + true); } bool AnywayDeleteIterator() const override { return true; } @@ -930,13 +930,17 @@ class HarnessTest : public testing::Test { InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); stl_wrappers::KVMap::const_iterator model_iter = data.begin(); - if (kVerbose) fprintf(stderr, "---\n"); + if (kVerbose) { + fprintf(stderr, "---\n"); + } for (int i = 0; i < 200; i++) { const int toss = rnd->Uniform(support_prev_ ? 5 : 3); switch (toss) { case 0: { if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Next\n"); + if (kVerbose) { + fprintf(stderr, "Next\n"); + } iter->Next(); ASSERT_OK(iter->status()); ++model_iter; @@ -946,7 +950,9 @@ class HarnessTest : public testing::Test { } case 1: { - if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + if (kVerbose) { + fprintf(stderr, "SeekToFirst\n"); + } iter->SeekToFirst(); ASSERT_OK(iter->status()); model_iter = data.begin(); @@ -957,8 +963,9 @@ class HarnessTest : public testing::Test { case 2: { std::string key = PickRandomKey(rnd, keys); model_iter = data.lower_bound(key); - if (kVerbose) + if (kVerbose) { fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str()); + } iter->Seek(Slice(key)); ASSERT_OK(iter->status()); ASSERT_EQ(ToString(data, model_iter), ToString(iter)); @@ -967,7 +974,9 @@ class HarnessTest : public testing::Test { case 3: { if (iter->Valid()) { - if (kVerbose) fprintf(stderr, "Prev\n"); + if (kVerbose) { + fprintf(stderr, "Prev\n"); + } iter->Prev(); ASSERT_OK(iter->status()); if (model_iter == data.begin()) { @@ -981,7 +990,9 @@ class HarnessTest : public testing::Test { } case 4: { - if (kVerbose) fprintf(stderr, "SeekToLast\n"); + if (kVerbose) { + fprintf(stderr, "SeekToLast\n"); + } iter->SeekToLast(); ASSERT_OK(iter->status()); if (keys.empty()) { @@ -1125,9 +1136,7 @@ class BlockBasedTableTest : public BlockBasedTableTestBase, virtual public ::testing::WithParamInterface { public: - BlockBasedTableTest() : format_(GetParam()) { - env_ = ROCKSDB_NAMESPACE::Env::Default(); - } + BlockBasedTableTest() : format_(GetParam()) { env_ = Env::Default(); } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions options; @@ -1253,7 +1262,7 @@ class FileChecksumTestHelper { public: FileChecksumTestHelper(bool convert_to_internal_key = false) : convert_to_internal_key_(convert_to_internal_key) {} - ~FileChecksumTestHelper() {} + ~FileChecksumTestHelper() = default; void CreateWritableFile() { sink_ = new test::StringSink(); @@ -1299,7 +1308,7 @@ class FileChecksumTestHelper { EXPECT_TRUE(table_builder_->status().ok()); } Status s = table_builder_->Finish(); - EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(file_writer_->Flush(IOOptions())); EXPECT_OK(s); EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); @@ -1307,7 +1316,7 @@ class FileChecksumTestHelper { } std::string GetFileChecksum() { - EXPECT_OK(file_writer_->Close()); + EXPECT_OK(file_writer_->Close(IOOptions())); return table_builder_->GetFileChecksum(); } @@ -1437,7 +1446,7 @@ TestIds GetUniqueId(TableProperties* tp, std::unordered_set* seen, std::string euid; EXPECT_OK(GetExtendedUniqueIdFromTableProperties(*tp, &euid)); EXPECT_EQ(euid.size(), 24U); - t.external_id[0] = DecodeFixed64(&euid[0]); + t.external_id[0] = DecodeFixed64(euid.data()); t.external_id[1] = DecodeFixed64(&euid[8]); t.external_id[2] = DecodeFixed64(&euid[16]); @@ -1445,7 +1454,7 @@ TestIds GetUniqueId(TableProperties* tp, std::unordered_set* seen, EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid)); EXPECT_EQ(uid.size(), 16U); EXPECT_EQ(uid, euid.substr(0, 16)); - EXPECT_EQ(t.external_id[0], DecodeFixed64(&uid[0])); + EXPECT_EQ(t.external_id[0], DecodeFixed64(uid.data())); EXPECT_EQ(t.external_id[1], DecodeFixed64(&uid[8])); } // All these should be effectively random @@ -1930,19 +1939,19 @@ void AssertKeysInCache(BlockBasedTable* table_reader, const std::vector& keys_not_in_cache, bool convert = false) { if (convert) { - for (auto key : keys_in_cache) { + for (const auto& key : keys_in_cache) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); } - for (auto key : keys_not_in_cache) { + for (const auto& key : keys_not_in_cache) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); } } else { - for (auto key : keys_in_cache) { + for (const auto& key : keys_in_cache) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); } - for (auto key : keys_not_in_cache) { + for (const auto& key : keys_not_in_cache) { ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); } } @@ -3140,6 +3149,470 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { c.ResetTableReader(); } +void GenerateKVMap(TableConstructor* c) { + int num_block = 100; + Random rnd(101); + uint32_t key = 0; + for (int block = 0; block < num_block; block++) { + for (int i = 0; i < 16; i++) { + char k[9] = {0}; + // Internal key is constructed directly from this key, + // and internal key size is required to be >= 8 bytes, + // so use %08u as the format string. + snprintf(k, sizeof(k), "%08u", key); + std::string v = rnd.RandomString(256); + InternalKey ikey(std::string(k), 0, kTypeValue); + c->Add(ikey.Encode().ToString(), rnd.RandomString(256)); + key++; + } + } +} + +void WarmUpCache(TableConstructor* c, const MutableCFOptions& moptions, + const std::vector& warm_keys) { + ReadOptions ro; + std::unique_ptr iter(c->GetTableReader()->NewIterator( + ro, moptions.prefix_extractor.get(), nullptr, false, + TableReaderCaller::kUncategorized)); + size_t i = 0; + while (i < warm_keys.size()) { + InternalKey ikey(warm_keys[i], 0, kTypeValue); + iter->Seek(ikey.Encode().ToString()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + i++; + } +} + +TEST_P(BlockBasedTableTest, BlockCacheLookupSeqScans) { + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + table_options.block_align = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator()); + GenerateKVMap(&c); + + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + BlockBasedTable* bbt = static_cast(c.GetTableReader()); + BlockHandle block_handle; + + ReadOptions read_options; + read_options.auto_readahead_size = true; + Slice ub = Slice("00000805"); + Slice* ub_ptr = &ub; + read_options.iterate_upper_bound = ub_ptr; + read_options.readahead_size = 16384; + + // Test various functionalities - + // 5 blocks prefetched - Current + 4 additional (readahead_size). + { + // Check the behavior when it's - + // Miss(200), Hit(210), Hit(225), Hit(240), Hit(255). + // It should only prefetch current block (200). + { + std::vector warm_keys{"00000210", "00000225", "00000240", + "00000255"}; + WarmUpCache(&c, moptions, warm_keys); + + ASSERT_OK(options.statistics->Reset()); + + std::unique_ptr iter(c.GetTableReader()->NewIterator( + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Seek key - + InternalKey ikey("00000200", 0, kTypeValue); + auto kv_iter = kvmap.find(ikey.Encode().ToString()); + + iter->Seek(kv_iter->first); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + FilePrefetchBuffer* prefetch_buffer = + (static_cast(iter.get())) + ->prefetch_buffer(); + std::vector> buffer_info(1); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle); + // It won't prefetch the data of cache hit. + // One block data. + ASSERT_EQ(buffer_info[0].second, 4096); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); + } + + { + // Check the behavior when it's - + // First Prefetch - Miss(315), Miss(330), Miss(345), Hit(360), Hit(375), + // Second Prefetch - Miss(390), Miss(405) ... + // First prefetch should only prefetch from 315 to 345. + std::vector warm_keys{"00000360", "00000375"}; + WarmUpCache(&c, moptions, warm_keys); + + std::unique_ptr iter(c.GetTableReader()->NewIterator( + read_options, moptions.prefix_extractor.get(), nullptr, false, + TableReaderCaller::kUncategorized)); + + // Seek key - + InternalKey ikey("00000315", 0, kTypeValue); + auto kv_iter = kvmap.find(ikey.Encode().ToString()); + + iter->Seek(kv_iter->first); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + FilePrefetchBuffer* prefetch_buffer = + (static_cast(iter.get())) + ->prefetch_buffer(); + std::vector> buffer_info(1); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle); + + // It won't prefetch the data of cache hit. + // 3 blocks data. + ASSERT_EQ(buffer_info[0].second, 12288); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + + for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) { + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + iter->Next(); + ASSERT_OK(iter->status()); + + if (iter->user_key().ToString() == "00000400") { + break; + } + } + + // Second Prefetch. + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle); + ASSERT_EQ(buffer_info[0].second, 20480); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); + } + } + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { + Options options; + TableConstructor c(BytewiseComparator()); + std::unique_ptr env( + new CompositeEnvWrapper(c.env_, FileSystem::Default())); + options.env = env.get(); + options.statistics = CreateDBStatistics(); + c.env_ = env.get(); + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + table_options.block_align = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + GenerateKVMap(&c); + + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + BlockBasedTable* bbt = static_cast(c.GetTableReader()); + BlockHandle block_handle; + + ReadOptions read_options; + read_options.auto_readahead_size = true; + Slice ub = Slice("00000805"); + Slice* ub_ptr = &ub; + read_options.iterate_upper_bound = ub_ptr; + read_options.readahead_size = 16384; + read_options.async_io = true; + + // Test Various functionalities - + // 3 blocks prefetched - Current + 2 additional (readahead_size/2). + { + // Check the behavior when it's - + // 1st Prefetch - Miss(200), Hit(210), Hit(225), + // 2nd Prefetch - Hit(240), Hit(255) + // First Prefetch will be for 200 offset. + // Second prefetch will be 0. + { + std::vector warm_keys{"00000210", "00000225", "00000240", + "00000255"}; + WarmUpCache(&c, moptions, warm_keys); + + ASSERT_OK(options.statistics->Reset()); + + std::unique_ptr iter(c.GetTableReader()->NewIterator( + read_options, moptions.prefix_extractor.get(), nullptr, false, + TableReaderCaller::kUncategorized)); + + // Seek key - + InternalKey ikey("00000200", 0, kTypeValue); + auto kv_iter = kvmap.find(ikey.Encode().ToString()); + + iter->Seek(kv_iter->first); + ASSERT_TRUE(iter->status().IsTryAgain()); + iter->Seek(kv_iter->first); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + FilePrefetchBuffer* prefetch_buffer = + (static_cast(iter.get())) + ->prefetch_buffer(); + std::vector> buffer_info(2); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + ASSERT_EQ(buffer_info[0].second, 4096); + ASSERT_EQ(buffer_info[1].second, 0); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 2); + } + { + // Check the behavior when it's - + // First Prefetch - Miss(315), Miss(330), Hit(345), + // Second Prefetch - Miss(360), Miss(375), ... + // First prefetch should only prefetch from 315 to 330. + // Second prefetch should start from 360. + std::vector warm_keys{"00000345"}; + WarmUpCache(&c, moptions, warm_keys); + + std::unique_ptr iter(c.GetTableReader()->NewIterator( + read_options, moptions.prefix_extractor.get(), nullptr, false, + TableReaderCaller::kUncategorized)); + + // Seek key - + InternalKey ikey("00000315", 0, kTypeValue); + auto kv_iter = kvmap.find(ikey.Encode().ToString()); + + iter->Seek(kv_iter->first); + ASSERT_TRUE(iter->status().IsTryAgain()); + iter->Seek(kv_iter->first); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + FilePrefetchBuffer* prefetch_buffer = + (static_cast(iter.get())) + ->prefetch_buffer(); + std::vector> buffer_info(2); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + { + // 1st Buffer Verification. + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, + block_handle); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + ASSERT_EQ(buffer_info[0].second, 8192); + + // 2nd Buffer Verification. + InternalKey ikey_tmp("00000360", 0, kTypeValue); + bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(), + block_handle); + ASSERT_EQ(buffer_info[1].first, block_handle.offset()); + ASSERT_EQ(buffer_info[1].second, 8192); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); + } + } + + { + // Check the behavior when it's - + // First Prefetch - Miss(495), Miss(510), Hit(525), prefetch len- 8192 + // Second Prefetch async - Miss(540), Miss(555), - 8192 + // Third Prefetch Async - Hit(570), Miss(585), - 4096 + // 4th Prefetch Async - Hit(600), Miss(615), - 4096 + // 5th Prefetch Async - Miss(630), Miss(645) - 8192 + std::vector warm_keys{"00000525", "00000570", "00000600"}; + WarmUpCache(&c, moptions, warm_keys); + + std::unique_ptr iter(c.GetTableReader()->NewIterator( + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Seek key - + InternalKey ikey("00000495", 0, kTypeValue); + auto kv_iter = kvmap.find(ikey.Encode().ToString()); + + // First and Second Prefetch. + iter->Seek(kv_iter->first); + ASSERT_TRUE(iter->status().IsTryAgain()); + iter->Seek(kv_iter->first); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + FilePrefetchBuffer* prefetch_buffer = + (static_cast(iter.get())) + ->prefetch_buffer(); + + { + std::vector> buffer_info(2); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + + // 1st Buffer Verification. + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, + block_handle); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + ASSERT_EQ(buffer_info[0].second, 8192); + + // 2nd Buffer Verification. + InternalKey ikey_tmp("00000540", 0, kTypeValue); + bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(), + block_handle); + ASSERT_EQ(buffer_info[1].first, block_handle.offset()); + ASSERT_EQ(buffer_info[1].second, 8192); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); + } + + // Third prefetch ReadAsync (buffers will swap). + for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) { + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + if (iter->user_key() == "00000540") { + break; + } + + iter->Next(); + ASSERT_OK(iter->status()); + } + + { + std::vector> buffer_info(2); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + + // 1st Buffer Verification. + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, + block_handle); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + ASSERT_EQ(buffer_info[0].second, 8192); + + // 2nd Buffer Verification. + InternalKey ikey_tmp("00000585", 0, kTypeValue); + bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(), + block_handle); + ASSERT_EQ(buffer_info[1].first, block_handle.offset()); + ASSERT_EQ(buffer_info[1].second, 4096); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); + } + + // 4th Prefetch ReadAsync (buffers will swap). + for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) { + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + if (iter->user_key() == "00000585") { + break; + } + + iter->Next(); + ASSERT_OK(iter->status()); + } + + { + std::vector> buffer_info(2); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + + // 1st Buffer Verification. + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, + block_handle); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + ASSERT_EQ(buffer_info[0].second, 4096); + + // 2nd Buffer Verification. + InternalKey ikey_tmp("00000615", 0, kTypeValue); + bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(), + block_handle); + ASSERT_EQ(buffer_info[1].first, block_handle.offset()); + ASSERT_EQ(buffer_info[1].second, 4096); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); + } + + // 5th Prefetch ReadAsync. + for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) { + ASSERT_EQ(iter->key(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + + if (iter->user_key() == "00000615") { + break; + } + + iter->Next(); + ASSERT_OK(iter->status()); + } + + { + std::vector> buffer_info(2); + prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info); + + // 1st Buffer Verification. + bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, + block_handle); + ASSERT_EQ(buffer_info[0].first, block_handle.offset()); + ASSERT_EQ(buffer_info[0].second, 4096); + + // 2nd Buffer Verification. + InternalKey ikey_tmp("00000630", 0, kTypeValue); + bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(), + block_handle); + ASSERT_EQ(buffer_info[1].first, block_handle.offset()); + ASSERT_EQ(buffer_info[1].second, 8192); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 0); + } + } + } + c.ResetTableReader(); +} + struct HitMissCountingCache : public CacheWrapper { using CacheWrapper::CacheWrapper; const char* Name() const override { return "HitMissCountingCache"; } @@ -3246,8 +3719,8 @@ TEST_P(BlockBasedTableTest, TracingMultiGetTest) { std::vector get_contexts; get_contexts.emplace_back( options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, - ukeys[0], &values[0], nullptr, nullptr, nullptr, true, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, get_id_offset); + ukeys[0], values.data(), nullptr, nullptr, nullptr, true, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, get_id_offset); get_contexts.emplace_back( options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, ukeys[1], &values[1], nullptr, nullptr, nullptr, true, nullptr, nullptr, @@ -3258,12 +3731,12 @@ TEST_P(BlockBasedTableTest, TracingMultiGetTest) { std::array statuses; autovector key_context; key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[0], - &values[0], + values.data(), /*PinnableWideColumns omitted*/ nullptr, - /*timestamp omitted*/ nullptr, &statuses[0]); + /*timestamp omitted*/ nullptr, statuses.data()); key_context[0].ukey_without_ts = ukeys[0]; key_context[0].ikey = encoded_keys[0]; - key_context[0].get_context = &get_contexts[0]; + key_context[0].get_context = get_contexts.data(); key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[1], &values[1], /*PinnableWideColumns omitted*/ nullptr, @@ -3302,8 +3775,8 @@ TEST_P(BlockBasedTableTest, TracingMultiGetTest) { record.block_type = TraceType::kBlockTraceFilterBlock; expected_records.push_back(record); } - // Then we should have three records for one index, one filter, and one data - // block access. (The two keys share a data block.) + // Then we should have three records for one index, one filter, and one + // data block access. (The two keys share a data block.) record.get_id = get_id_offset; record.block_type = TraceType::kBlockTraceFilterBlock; record.caller = TableReaderCaller::kUserMultiGet; @@ -3419,8 +3892,8 @@ TEST_P(BlockBasedTableTest, TracingIterator) { record.is_cache_hit = false; expected_records.push_back(record); expected_records.push_back(record); - // When we iterate this file for the second time, we should observe all cache - // hits. + // When we iterate this file for the second time, we should observe all + // cache hits. record.block_type = TraceType::kBlockTraceIndexBlock; record.is_cache_hit = true; expected_records.push_back(record); @@ -3494,8 +3967,8 @@ class BlockCachePropertiesSnapshot { int64_t block_cache_bytes_write = 0; }; -// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't -// use block cache to store them). +// Make sure, by default, index/filter blocks were pre-loaded (meaning we +// won't use block cache to store them). TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { Options options; options.create_if_missing = true; @@ -3741,7 +4214,8 @@ void ValidateBlockRestartInterval(int value, int expected) { } TEST_P(BlockBasedTableTest, InvalidOptions) { - // invalid values for block_size_deviation (<0 or >100) are silently set to 0 + // invalid values for block_size_deviation (<0 or >100) are silently set to + // 0 ValidateBlockSizeDeviation(-10, 0); ValidateBlockSizeDeviation(-1, 0); ValidateBlockSizeDeviation(0, 0); @@ -3849,8 +4323,8 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { TEST_P(BlockBasedTableTest, BlockCacheLeak) { // Check that when we reopen a table we don't lose access to blocks already - // in the cache. This test checks whether the Table actually makes use of the - // unique ID from the file. + // in the cache. This test checks whether the Table actually makes use of + // the unique ID from the file. Options opt; std::unique_ptr ikc; @@ -3972,15 +4446,17 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); int level = 0; - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; FileChecksumTestHelper f(true); f.CreateWritableFile(); std::unique_ptr builder; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, *comparator, - &int_tbl_prop_collector_factories, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + *comparator, &internal_tbl_prop_coll_factories, options.compression, options.compression_opts, kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); @@ -4002,7 +4478,7 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); int level = 0; - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; FileChecksumGenContext gen_context; @@ -4014,9 +4490,11 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { f.CreateWritableFile(); f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder; + const ReadOptions read_options; + const WriteOptions write_options; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, *comparator, - &int_tbl_prop_collector_factories, + TableBuilderOptions(ioptions, moptions, read_options, write_options, + *comparator, &internal_tbl_prop_coll_factories, options.compression, options.compression_opts, kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); @@ -4057,12 +4535,14 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; int unknown_level = -1; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kNoCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, unknown_level), file_writer.get())); @@ -4074,7 +4554,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { builder->Add(key, value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); test::StringSink* ss = static_cast(file_writer->writable_file()); @@ -4084,7 +4564,6 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; - const ReadOptions read_options; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), kPlainTableMagicNumber, ioptions, read_options, &props); @@ -4109,15 +4588,16 @@ TEST_F(PlainTableTest, NoFileChecksum) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; int unknown_level = -1; FileChecksumTestHelper f(true); f.CreateWritableFile(); - + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kNoCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, unknown_level), f.GetFileWriter())); @@ -4142,7 +4622,7 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; int unknown_level = -1; @@ -4154,10 +4634,11 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { FileChecksumTestHelper f(true); f.CreateWritableFile(); f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); - + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kNoCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, unknown_level), f.GetFileWriter())); @@ -4174,7 +4655,6 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); } - TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("k01", "hello"); @@ -4227,8 +4707,10 @@ static void DoCompressionTest(CompressionType comp) { Options options; test::PlainInternalKeyComparator ikc(options.comparator); options.compression = comp; + options.db_host_id = ""; BlockBasedTableOptions table_options; table_options.block_size = 1024; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); @@ -4303,7 +4785,8 @@ TEST_F(GeneralTableTest, ApproximateKeyAnchors) { std::vector anchors; ASSERT_OK(c.GetTableReader()->ApproximateKeyAnchors(ReadOptions(), anchors)); - // The target is 128 anchors. But in reality it can be slightly more or fewer. + // The target is 128 anchors. But in reality it can be slightly more or + // fewer. ASSERT_GT(anchors.size(), 120); ASSERT_LT(anchors.size(), 140); @@ -4411,12 +4894,13 @@ TEST_F(MemTableTest, Simple) { for (int i = 0; i < 2; ++i) { Arena arena; - ScopedArenaIterator arena_iter_guard; + ScopedArenaPtr arena_iter_guard; std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = GetMemTable()->NewIterator(ReadOptions(), &arena); - arena_iter_guard.set(iter); + iter = GetMemTable()->NewIterator( + ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena); + arena_iter_guard.reset(iter); } else { iter = GetMemTable()->NewRangeTombstoneIterator( ReadOptions(), kMaxSequenceNumber /* read_seq */, @@ -4660,14 +5144,15 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { class PrefixTest : public testing::Test { public: PrefixTest() : testing::Test() {} - ~PrefixTest() override {} + ~PrefixTest() override = default; }; namespace { // A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform { public: - ~TestPrefixExtractor() override{}; + ~TestPrefixExtractor() override = default; + ; const char* Name() const override { return "TestPrefixExtractor"; } ROCKSDB_NAMESPACE::Slice Transform( @@ -4758,14 +5243,16 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; - int_tbl_prop_collector_factories.emplace_back( + InternalTblPropCollFactories internal_tbl_prop_coll_factories; + internal_tbl_prop_coll_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kNoCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, -1), file_writer.get())); @@ -4778,7 +5265,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); test::RandomRWStringSink ss_rw(sink); uint32_t version; @@ -4793,7 +5280,6 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new RandomAccessFileReader(std::move(source), "")); std::unique_ptr props; - const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, read_options, &props)); @@ -4817,7 +5303,6 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Helper function to get the contents of the table InternalIterator std::unique_ptr table_reader; - const ReadOptions read_options; std::function GetTableInternalIter = [&]() { std::unique_ptr source( new test::StringSource(ss_rw.contents(), 73342, true)); @@ -4943,11 +5428,13 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kNoCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, -1), file_writer.get())); @@ -4962,7 +5449,7 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); std::unique_ptr source( new test::StringSource(sink->contents(), 73342, false)); @@ -4971,7 +5458,6 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { std::unique_ptr props; - const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, read_options, &props)); @@ -4999,7 +5485,6 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { 0 /* block_protection_bytes_per_key */), std::move(file_reader), sink->contents().size(), &table_reader)); - ReadOptions read_options; std::unique_ptr db_iter(table_reader->NewIterator( read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); @@ -5035,12 +5520,13 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::string column_family_name; - + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kNoCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kNoCompression, CompressionOptions(), kUnknownColumnFamily, column_family_name, -1), file_writer.get())); @@ -5055,7 +5541,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Flush(IOOptions())); std::unique_ptr source( new test::StringSource(sink->contents(), 73342, true)); @@ -5067,20 +5553,19 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { uint64_t file_size = sink->contents().size(); Footer footer; - IOOptions opts; - ASSERT_OK(ReadFooterFromFile(opts, file, *FileSystem::Default(), + ASSERT_OK(ReadFooterFromFile(IOOptions(), file, *FileSystem::Default(), nullptr /* prefetch_buffer */, file_size, &footer, kBlockBasedTableMagicNumber)); auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { - ReadOptions read_options; - read_options.verify_checksums = false; + ReadOptions read_options_for_helper; + read_options_for_helper.verify_checksums = false; PersistentCacheOptions cache_options; BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, handle, - contents, ioptions, false /* decompress */, + file, nullptr /* prefetch_buffer */, footer, read_options_for_helper, + handle, contents, ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), cache_options); @@ -5390,8 +5875,8 @@ TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) { TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { TailPrefetchStats tpstats; - FilePrefetchBuffer buffer(0 /* readahead_size */, 0 /* max_readahead_size */, - false /* enable */, true /* track_min_offset */); + FilePrefetchBuffer buffer(ReadaheadParams(), false /* enable */, + true /* track_min_offset */); IOOptions opts; buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */, 10 /* n */, nullptr /* result */, @@ -5626,14 +6111,17 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) { ImmutableOptions ioptions(options); MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder( options.table_factory->NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kSnappyCompression, options.compression_opts, - kUnknownColumnFamily, "test_cf", -1 /* level */), + TableBuilderOptions(ioptions, moptions, read_options, write_options, + ikc, &internal_tbl_prop_coll_factories, + kSnappyCompression, options.compression_opts, + kUnknownColumnFamily, "test_cf", + -1 /* level */), file_writer.get())); std::string key1 = "key1"; @@ -5702,11 +6190,13 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, ImmutableOptions ioptions(options); MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kSnappyCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kSnappyCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */), file_writer.get())); @@ -5787,11 +6277,13 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) { ImmutableOptions ioptions(options); MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; + const ReadOptions read_options; + const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, ikc, - &int_tbl_prop_collector_factories, kSnappyCompression, + TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, kSnappyCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */), file_writer.get())); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 4b6634e5cfe..c66a94fb54e 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -70,7 +70,9 @@ class TwoLevelIndexIterator : public InternalIteratorBase { private: void SaveError(const Status& s) { - if (status_.ok() && !s.ok()) status_ = s; + if (status_.ok() && !s.ok()) { + status_ = s; + } } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 1fed9341752..0614f57dac3 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -36,7 +36,7 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIteratorBase* NewTwoLevelIterator( +InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, InternalIteratorBase* first_level_iter); diff --git a/table/unique_id.cc b/table/unique_id.cc index fcdd756504a..8bfa8bcfd38 100644 --- a/table/unique_id.cc +++ b/table/unique_id.cc @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { std::string EncodeSessionId(uint64_t upper, uint64_t lower) { std::string db_session_id(20U, '\0'); - char *buf = &db_session_id[0]; + char *buf = db_session_id.data(); // Preserving `lower` is slightly tricky. 36^12 is slightly more than // 62 bits, so we use 12 chars plus the bottom two bits of one more. // (A tiny fraction of 20 digit strings go unused.) @@ -152,7 +152,7 @@ void ExternalUniqueIdToInternal(UniqueIdPtr in_out) { std::string EncodeUniqueIdBytes(UniqueIdPtr in) { std::string ret(in.extended ? 24U : 16U, '\0'); - EncodeFixed64(&ret[0], in.ptr[0]); + EncodeFixed64(ret.data(), in.ptr[0]); EncodeFixed64(&ret[8], in.ptr[1]); if (in.extended) { EncodeFixed64(&ret[16], in.ptr[2]); diff --git a/test_util/mock_time_env.cc b/test_util/mock_time_env.cc index 23888e69e3f..c8ed80347a2 100644 --- a/test_util/mock_time_env.cc +++ b/test_util/mock_time_env.cc @@ -25,9 +25,9 @@ void MockSystemClock::InstallTimedWaitFixCallback() { // but is interpreted in real clock time.) SyncPoint::GetInstance()->SetCallBack( "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); + uint64_t time_us = *static_cast(arg); if (time_us < this->RealNowMicros()) { - *reinterpret_cast(arg) = this->RealNowMicros() + 1000; + *static_cast(arg) = this->RealNowMicros() + 1000; } }); #endif // OS_MACOSX diff --git a/test_util/mock_time_env.h b/test_util/mock_time_env.h index 19bb9e76de9..701a92d6e5b 100644 --- a/test_util/mock_time_env.h +++ b/test_util/mock_time_env.h @@ -26,7 +26,7 @@ class MockSystemClock : public SystemClockWrapper { static const char* kClassName() { return "MockSystemClock"; } const char* Name() const override { return kClassName(); } - virtual Status GetCurrentTime(int64_t* time_sec) override { + Status GetCurrentTime(int64_t* time_sec) override { assert(time_sec != nullptr); *time_sec = static_cast(current_time_us_ / kMicrosInSecond); return Status::OK(); @@ -34,9 +34,9 @@ class MockSystemClock : public SystemClockWrapper { virtual uint64_t NowSeconds() { return current_time_us_ / kMicrosInSecond; } - virtual uint64_t NowMicros() override { return current_time_us_; } + uint64_t NowMicros() override { return current_time_us_; } - virtual uint64_t NowNanos() override { + uint64_t NowNanos() override { assert(current_time_us_ <= std::numeric_limits::max() / 1000); return current_time_us_ * 1000; } @@ -69,8 +69,8 @@ class MockSystemClock : public SystemClockWrapper { current_time_us_.fetch_add(micros); } - virtual bool TimedWait(port::CondVar* cv, - std::chrono::microseconds deadline) override { + bool TimedWait(port::CondVar* cv, + std::chrono::microseconds deadline) override { uint64_t now_micros = NowMicros(); uint64_t deadline_micros = static_cast(deadline.count()); uint64_t delay_micros; diff --git a/test_util/secondary_cache_test_util.cc b/test_util/secondary_cache_test_util.cc index 6f0bd384948..b5693de0590 100644 --- a/test_util/secondary_cache_test_util.cc +++ b/test_util/secondary_cache_test_util.cc @@ -7,9 +7,7 @@ #include -namespace ROCKSDB_NAMESPACE { - -namespace secondary_cache_test_util { +namespace ROCKSDB_NAMESPACE::secondary_cache_test_util { namespace { using TestItem = WithCacheType::TestItem; @@ -92,6 +90,4 @@ const Cache::CacheItemHelper* WithCacheType::GetHelperFail(CacheEntryRole r) { return GetHelper(r, true, true); } -} // namespace secondary_cache_test_util - -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::secondary_cache_test_util diff --git a/test_util/testharness.cc b/test_util/testharness.cc index 3c7b835d2f7..89c7fd9775a 100644 --- a/test_util/testharness.cc +++ b/test_util/testharness.cc @@ -13,8 +13,7 @@ #include #include -namespace ROCKSDB_NAMESPACE { -namespace test { +namespace ROCKSDB_NAMESPACE::test { #ifdef OS_WIN #include @@ -103,5 +102,4 @@ ::testing::AssertionResult AssertMatchesRegex(const char* str_expr, } } -} // namespace test -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::test diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 1e771f4fd16..5372126ef5f 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -34,8 +34,7 @@ void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} #endif -namespace ROCKSDB_NAMESPACE { -namespace test { +namespace ROCKSDB_NAMESPACE::test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; const std::set kFooterFormatVersionsToTest{ @@ -91,10 +90,12 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode) { return test_mode != UserDefinedTimestampTestMode::kStripUserDefinedTimestamp; } -extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst) { +Slice CompressibleString(Random* rnd, double compressed_fraction, int len, + std::string* dst) { int raw = static_cast(len * compressed_fraction); - if (raw < 1) raw = 1; + if (raw < 1) { + raw = 1; + } std::string raw_data = rnd->RandomBinaryString(raw); // Duplicate the random data until we have filled "len" bytes @@ -109,7 +110,7 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction, namespace { class Uint64ComparatorImpl : public Comparator { public: - Uint64ComparatorImpl() {} + Uint64ComparatorImpl() = default; const char* Name() const override { return "rocksdb.Uint64Comparator"; } @@ -131,11 +132,9 @@ class Uint64ComparatorImpl : public Comparator { } void FindShortestSeparator(std::string* /*start*/, - const Slice& /*limit*/) const override { - return; - } + const Slice& /*limit*/) const override {} - void FindShortSuccessor(std::string* /*key*/) const override { return; } + void FindShortSuccessor(std::string* /*key*/) const override {} }; } // namespace @@ -370,6 +369,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); cf_opt->enable_blob_files = rnd->Uniform(2); cf_opt->enable_blob_garbage_collection = rnd->Uniform(2); + cf_opt->strict_max_successive_merges = rnd->Uniform(2); // double options cf_opt->memtable_prefix_bloom_size_ratio = @@ -463,15 +463,16 @@ bool IsPrefetchSupported(const std::shared_ptr& fs, Random rnd(301); std::string test_string = rnd.RandomString(4096); Slice data(test_string); - Status s = WriteStringToFile(fs.get(), data, tmp, true); + IOOptions opts; + Status s = WriteStringToFile(fs.get(), data, tmp, true, opts); if (s.ok()) { std::unique_ptr file; auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr); if (io_s.ok()) { - supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr) - .IsNotSupported()); + supported = + !(file->Prefetch(0, data.size(), opts, nullptr).IsNotSupported()); } - s = fs->DeleteFile(tmp, IOOptions(), nullptr); + s = fs->DeleteFile(tmp, opts, nullptr); } return s.ok() && supported; } @@ -521,7 +522,7 @@ Status CorruptFile(Env* env, const std::string& fname, int offset, for (int i = 0; i < bytes_to_corrupt; i++) { contents[i + offset] ^= 0x80; } - s = WriteStringToFile(env, contents, fname); + s = WriteStringToFile(env, contents, fname, false /* should_sync */); } if (s.ok() && verify_checksum) { Options options; @@ -544,7 +545,7 @@ Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) { s = ReadFileToString(env, fname, &contents); if (s.ok()) { contents.resize(static_cast(new_length), 'b'); - s = WriteStringToFile(env, contents, fname); + s = WriteStringToFile(env, contents, fname, false /* should_sync */); } return s; } @@ -590,13 +591,13 @@ class SpecialMemTableRep : public MemTableRep { num_entries_flush_(num_entries_flush), num_entries_(0) {} - virtual KeyHandle Allocate(const size_t len, char** buf) override { + KeyHandle Allocate(const size_t len, char** buf) override { return memtable_->Allocate(len, buf); } // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. - virtual void Insert(KeyHandle handle) override { + void Insert(KeyHandle handle) override { num_entries_++; memtable_->Insert(handle); } @@ -607,19 +608,18 @@ class SpecialMemTableRep : public MemTableRep { } // Returns true iff an entry that compares equal to key is in the list. - virtual bool Contains(const char* key) const override { + bool Contains(const char* key) const override { return memtable_->Contains(key); } - virtual size_t ApproximateMemoryUsage() override { + size_t ApproximateMemoryUsage() override { // Return a high memory usage when number of entries exceeds the threshold // to trigger a flush. return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024; } - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override { + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override { memtable_->Get(k, callback_args, callback_func); } @@ -628,11 +628,11 @@ class SpecialMemTableRep : public MemTableRep { return memtable_->ApproximateNumEntries(start_ikey, end_ikey); } - virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { return memtable_->GetIterator(arena); } - virtual ~SpecialMemTableRep() override {} + ~SpecialMemTableRep() override = default; private: std::unique_ptr memtable_; @@ -647,7 +647,7 @@ class SpecialSkipListFactory : public MemTableRepFactory { .AddNumber(":"), [](const std::string& uri, std::unique_ptr* guard, std::string* /* errmsg */) { - auto colon = uri.find(":"); + auto colon = uri.find(':'); if (colon != std::string::npos) { auto count = ParseInt(uri.substr(colon + 1)); guard->reset(new SpecialSkipListFactory(count)); @@ -664,16 +664,17 @@ class SpecialSkipListFactory : public MemTableRepFactory { : num_entries_flush_(num_entries_flush) {} using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* /*logger*/) override { + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, + const SliceTransform* transform, + Logger* /*logger*/) override { return new SpecialMemTableRep( allocator, factory_.CreateMemTableRep(compare, allocator, transform, nullptr), num_entries_flush_); } static const char* kClassName() { return "SpecialSkipListFactory"; } - virtual const char* Name() const override { return kClassName(); } + const char* Name() const override { return kClassName(); } std::string GetId() const override { std::string id = Name(); if (num_entries_flush_ > 0) { @@ -747,5 +748,4 @@ void RegisterTestLibrary(const std::string& arg) { ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg); } } -} // namespace test -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::test diff --git a/test_util/testutil.h b/test_util/testutil.h index eca1ff794e9..b3fa0954cbf 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -49,8 +49,8 @@ extern const std::set kFooterFormatVersionsToTest; // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE }; -extern std::string RandomKey(Random* rnd, int len, - RandomKeyType type = RandomKeyType::RANDOM); +std::string RandomKey(Random* rnd, int len, + RandomKeyType type = RandomKeyType::RANDOM); enum class UserDefinedTimestampTestMode { // Test does not enable user-defined timestamp feature. @@ -62,17 +62,17 @@ enum class UserDefinedTimestampTestMode { kStripUserDefinedTimestamp, }; -extern const std::vector& GetUDTTestModes(); +const std::vector& GetUDTTestModes(); -extern bool IsUDTEnabled(const UserDefinedTimestampTestMode& test_mode); +bool IsUDTEnabled(const UserDefinedTimestampTestMode& test_mode); -extern bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode); +bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode); // Store in *dst a string of length "len" that will compress to // "N*compressed_fraction" bytes and return a Slice that references // the generated data. -extern Slice CompressibleString(Random* rnd, double compressed_fraction, - int len, std::string* dst); +Slice CompressibleString(Random* rnd, double compressed_fraction, int len, + std::string* dst); #ifndef NDEBUG // An internal comparator that just forward comparing results from the @@ -86,7 +86,7 @@ class PlainInternalKeyComparator : public InternalKeyComparator { virtual ~PlainInternalKeyComparator() {} - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { return user_comparator()->Compare(a, b); } }; @@ -102,9 +102,9 @@ class SimpleSuffixReverseComparator : public Comparator { public: SimpleSuffixReverseComparator() {} static const char* kClassName() { return "SimpleSuffixReverseComparator"; } - virtual const char* Name() const override { return kClassName(); } + const char* Name() const override { return kClassName(); } - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { Slice prefix_a = Slice(a.data(), 8); Slice prefix_b = Slice(b.data(), 8); int prefix_comp = prefix_a.compare(prefix_b); @@ -116,10 +116,10 @@ class SimpleSuffixReverseComparator : public Comparator { return -(suffix_a.compare(suffix_b)); } } - virtual void FindShortestSeparator(std::string* /*start*/, - const Slice& /*limit*/) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* /*key*/) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; // Returns a user key comparator that can be used for comparing two uint64_t @@ -127,13 +127,13 @@ class SimpleSuffixReverseComparator : public Comparator { // at once. Assumes same endian-ness is used though the database's lifetime. // Symantics of comparison would differ from Bytewise comparator in little // endian machines. -extern const Comparator* Uint64Comparator(); +const Comparator* Uint64Comparator(); // A wrapper api for getting the ComparatorWithU64Ts -extern const Comparator* BytewiseComparatorWithU64TsWrapper(); +const Comparator* BytewiseComparatorWithU64TsWrapper(); // A wrapper api for getting the ComparatorWithU64Ts -extern const Comparator* ReverseBytewiseComparatorWithU64TsWrapper(); +const Comparator* ReverseBytewiseComparatorWithU64TsWrapper(); class StringSink : public FSWritableFile { public: @@ -189,6 +189,11 @@ class StringSink : public FSWritableFile { } } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return contents_.size(); + } + private: Slice* reader_contents_; size_t last_flush_; @@ -285,6 +290,11 @@ class OverwritingStringSink : public FSWritableFile { if (last_flush_ > contents_.size()) last_flush_ = contents_.size(); } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return contents_.size(); + } + private: std::string contents_; Slice* reader_contents_; @@ -359,20 +369,19 @@ class StringSource : public FSRandomAccessFile { class NullLogger : public Logger { public: using Logger::Logv; - virtual void Logv(const char* /*format*/, va_list /*ap*/) override {} - virtual size_t GetLogFileSize() const override { return 0; } + void Logv(const char* /*format*/, va_list /*ap*/) override {} + size_t GetLogFileSize() const override { return 0; } }; // Corrupts key by changing the type -extern void CorruptKeyType(InternalKey* ikey); +void CorruptKeyType(InternalKey* ikey); -extern std::string KeyStr(const std::string& user_key, - const SequenceNumber& seq, const ValueType& t, - bool corrupt = false); +std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, + const ValueType& t, bool corrupt = false); -extern std::string KeyStr(uint64_t ts, const std::string& user_key, - const SequenceNumber& seq, const ValueType& t, - bool corrupt = false); +std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); class SleepingBackgroundTask { public: @@ -446,7 +455,7 @@ class SleepingBackgroundTask { } static void DoSleepTask(void* arg) { - reinterpret_cast(arg)->DoSleep(); + static_cast(arg)->DoSleep(); } private: @@ -563,6 +572,14 @@ class StringFS : public FileSystemWrapper { return IOStatus::OK(); } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (contents_ != nullptr) { + return contents_->size(); + } + return 0; + } + private: std::string* contents_; }; @@ -731,14 +748,14 @@ class ChanglingMergeOperator : public MergeOperator { void SetName(const std::string& name) { name_ = name; } - virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, - MergeOperationOutput* /*merge_out*/) const override { + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { return false; } - virtual bool PartialMergeMulti(const Slice& /*key*/, - const std::deque& /*operand_list*/, - std::string* /*new_value*/, - Logger* /*logger*/) const override { + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { return false; } static const char* kClassName() { return "ChanglingMergeOperator"; } @@ -752,7 +769,7 @@ class ChanglingMergeOperator : public MergeOperator { } } - virtual const char* Name() const override { return name_.c_str(); } + const char* Name() const override { return name_.c_str(); } protected: std::string name_; @@ -831,7 +848,7 @@ class ChanglingCompactionFilterFactory : public CompactionFilterFactory { // The factory for the hacky skip list mem table that triggers flush after // number of entries exceeds a threshold. -extern MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush); +MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush); CompressionType RandomCompressionType(Random* rnd); diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 73d9e961356..b269e09b109 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -460,10 +460,12 @@ function start_stats { pspid=$! while :; do - b_gb=$( ls -l $DB_DIR 2> /dev/null | grep blob | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) - s_gb=$( ls -l $DB_DIR 2> /dev/null | grep sst | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) - l_gb=$( ls -l $WAL_DIR 2> /dev/null | grep log | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) - a_gb=$( ls -l $DB_DIR 2> /dev/null | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) + eval $( ls -l $DB_DIR 2> /dev/null | awk ' + BEGIN { gb = 1024*1024*1024 } + /blob/{ bb += $5 } /sst/{ sb += $5 } { ab += $5 } + END { printf "b_gb=%.1f s_gb=%.1f a_gb=%.1f", bb / gb, sb / gb, ab / gb } + ') + l_gb=$( ls -l $WAL_DIR 2> /dev/null | awk '/log/{ b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) ts=$( date +%H%M%S ) echo -e "${a_gb}\t${s_gb}\t${l_gb}\t${b_gb}\t${ts}" sleep 10 diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index f2d4f05bea7..38532660b0a 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -577,7 +577,7 @@ void BlockCacheTraceAnalyzer::WriteSkewness( std::map> label_bucket_naccesses; std::vector> pairs; for (auto const& itr : label_naccesses) { - pairs.push_back(itr); + pairs.emplace_back(itr); } // Sort in descending order. sort(pairs.begin(), pairs.end(), @@ -1571,6 +1571,10 @@ Status BlockCacheTraceAnalyzer::Analyze() { miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access(access.caller), !access.is_cache_hit); + caller_miss_ratio_stats_map_[access.caller].UpdateMetrics( + access.access_timestamp, is_user_access(access.caller), + !access.is_cache_hit); + if (cache_simulator_) { cache_simulator_->Access(access); } @@ -1586,6 +1590,14 @@ Status BlockCacheTraceAnalyzer::Analyze() { " seconds. Observed miss ratio %.2f\n", duration, duration > 0 ? access_sequence_number_ / duration : 0, trace_duration, miss_ratio_stats_.miss_ratio()); + + for (const auto& caller : caller_miss_ratio_stats_map_) { + fprintf(stdout, "Caller %s: Observed miss ratio %.2f\n", + caller_to_string(caller.first).c_str(), + caller.second.miss_ratio()); + } + print_break_lines(/*num_break_lines=*/1); + time_interval++; } } @@ -1599,6 +1611,11 @@ Status BlockCacheTraceAnalyzer::Analyze() { " seconds. Observed miss ratio %.2f\n", duration, duration > 0 ? access_sequence_number_ / duration : 0, trace_duration, miss_ratio_stats_.miss_ratio()); + for (const auto& caller : caller_miss_ratio_stats_map_) { + fprintf(stdout, "Caller %s: Observed miss ratio %.2f\n", + caller_to_string(caller.first).c_str(), caller.second.miss_ratio()); + } + print_break_lines(/*num_break_lines=*/1); return s; } diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h index 2f1ebd139ba..a7842d1d4a2 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.h +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.h @@ -387,6 +387,7 @@ class BlockCacheTraceAnalyzer { uint64_t trace_start_timestamp_in_seconds_ = 0; uint64_t trace_end_timestamp_in_seconds_ = 0; MissRatioStats miss_ratio_stats_; + std::map caller_miss_ratio_stats_map_; uint64_t unique_block_id_ = 1; uint64_t unique_get_key_id_ = 1; BlockCacheHumanReadableTraceWriter human_readable_trace_writer_; diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index 174565641f7..77a6d1b2bb3 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -492,7 +492,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { ASSERT_EQ(20, ParseDouble(percent)); } ASSERT_EQ(expected_callers.size(), callers.size()); - for (auto caller : callers) { + for (const auto& caller : callers) { ASSERT_TRUE(expected_callers.find(caller) != expected_callers.end()); } ASSERT_OK(env_->DeleteFile(percent_access_summary_file)); @@ -504,7 +504,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::string caller; ASSERT_TRUE(getline(analyzing_callers, caller, ',')); std::vector breakdowns{"level", "bt"}; - for (auto breakdown : breakdowns) { + for (const auto& breakdown : breakdowns) { const std::string file_name = test_path_ + "/" + caller + "_" + breakdown + "_percentage_of_accesses_summary"; @@ -554,7 +554,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } for (auto const& access_type : access_types) { std::vector block_types{"Index", "Data", "Filter"}; - for (auto block_type : block_types) { + for (const auto& block_type : block_types) { // Validate reuse block timeline. const std::string reuse_blocks_timeline = test_path_ + "/" + block_type + "_" + access_type + diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 93b51a9b9e9..fc00556ecd0 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb" "8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb") +declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment @@ -133,7 +133,7 @@ declare -a db_forward_no_options_refs=() # N/A at the moment # To check for SST ingestion backward compatibility (new version reading # data from old) (ldb ingest_extern_sst added in 5.16.x, back-ported to # 5.14.x, 5.15.x) -declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb") +declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb") # To check for SST ingestion forward compatibility (old version reading # data from new) as well as backward compatibility declare -a ext_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}") diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 340a8a3a1e2..d47ffb5385f 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -15,9 +15,10 @@ #include #endif #include -#include -#include #include + +#include +#include #ifdef __APPLE__ #include #include @@ -594,6 +595,12 @@ static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compressed_secondary_cache_compression_type_e = ROCKSDB_NAMESPACE::kLZ4Compression; +DEFINE_int32(compressed_secondary_cache_compression_level, + ROCKSDB_NAMESPACE::CompressionOptions().level, + "Compression level. The meaning of this value is library-" + "dependent. If unset, we try to use the default for the library " + "specified in `--compressed_secondary_cache_compression_type`"); + DEFINE_uint32( compressed_secondary_cache_compress_format_version, 2, "compress_format_version can have two values: " @@ -602,12 +609,17 @@ DEFINE_uint32( "compress_format_version == 2 -- decompressed size is included" " in the block header in varint32 format."); -DEFINE_bool(use_tiered_volatile_cache, false, +DEFINE_bool(use_tiered_cache, false, "If use_compressed_secondary_cache is true and " "use_tiered_volatile_cache is true, then allocate a tiered cache " "that distributes cache reservations proportionally over both " "the caches."); +DEFINE_string( + tiered_adm_policy, "auto", + "Admission policy to use for the secondary cache(s) in the tiered cache. " + "Allowed values are auto, placeholder, allow_cache_hits, and three_queue."); + DEFINE_int64(simcache_size, -1, "Number of bytes to use as a simcache of " "uncompressed data. Nagative value disables simcache."); @@ -918,11 +930,6 @@ DEFINE_bool(force_consistency_checks, "Runs consistency checks on the LSM every time a change is " "applied."); -DEFINE_bool(check_flush_compaction_key_order, - ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order, - "During flush or compaction, check whether keys inserted to " - "output files are in order."); - DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Ignored. Left here for backward compatibility"); @@ -1253,28 +1260,46 @@ static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( const char* ctype) { assert(ctype); - if (!strcasecmp(ctype, "none")) + if (!strcasecmp(ctype, "none")) { return ROCKSDB_NAMESPACE::kNoCompression; - else if (!strcasecmp(ctype, "snappy")) + } else if (!strcasecmp(ctype, "snappy")) { return ROCKSDB_NAMESPACE::kSnappyCompression; - else if (!strcasecmp(ctype, "zlib")) + } else if (!strcasecmp(ctype, "zlib")) { return ROCKSDB_NAMESPACE::kZlibCompression; - else if (!strcasecmp(ctype, "bzip2")) + } else if (!strcasecmp(ctype, "bzip2")) { return ROCKSDB_NAMESPACE::kBZip2Compression; - else if (!strcasecmp(ctype, "lz4")) + } else if (!strcasecmp(ctype, "lz4")) { return ROCKSDB_NAMESPACE::kLZ4Compression; - else if (!strcasecmp(ctype, "lz4hc")) + } else if (!strcasecmp(ctype, "lz4hc")) { return ROCKSDB_NAMESPACE::kLZ4HCCompression; - else if (!strcasecmp(ctype, "xpress")) + } else if (!strcasecmp(ctype, "xpress")) { return ROCKSDB_NAMESPACE::kXpressCompression; - else if (!strcasecmp(ctype, "zstd")) + } else if (!strcasecmp(ctype, "zstd")) { return ROCKSDB_NAMESPACE::kZSTD; - else { + } else { fprintf(stderr, "Cannot parse compression type '%s'\n", ctype); exit(1); } } +static enum ROCKSDB_NAMESPACE::TieredAdmissionPolicy StringToAdmissionPolicy( + const char* policy) { + assert(policy); + + if (!strcasecmp(policy, "auto")) { + return ROCKSDB_NAMESPACE::kAdmPolicyAuto; + } else if (!strcasecmp(policy, "placeholder")) { + return ROCKSDB_NAMESPACE::kAdmPolicyPlaceholder; + } else if (!strcasecmp(policy, "allow_cache_hits")) { + return ROCKSDB_NAMESPACE::kAdmPolicyAllowCacheHits; + } else if (!strcasecmp(policy, "three_queue")) { + return ROCKSDB_NAMESPACE::kAdmPolicyThreeQueue; + } else { + fprintf(stderr, "Cannot parse admission policy %s\n", policy); + exit(1); + } +} + static std::string ColumnFamilyName(size_t i) { if (i == 0) { return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName; @@ -1446,6 +1471,9 @@ DEFINE_bool(rate_limiter_auto_tuned, false, "Enable dynamic adjustment of rate limit according to demand for " "background I/O"); +DEFINE_int64(rate_limiter_single_burst_bytes, 0, + "Set single burst bytes on background I/O rate limiter."); + DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit"); DEFINE_uint64( @@ -1567,11 +1595,6 @@ DEFINE_bool(advise_random_on_open, ROCKSDB_NAMESPACE::Options().advise_random_on_open, "Advise random access on table file open"); -DEFINE_string(compaction_fadvice, "NORMAL", - "Access pattern advice when a file is compacted"); -static auto FLAGS_compaction_fadvice_e = - ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start; - DEFINE_bool(use_tailing_iterator, false, "Use tailing iterator to access a series of keys instead of get"); @@ -1618,6 +1641,10 @@ DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge operations on a key in the " "memtable"); +DEFINE_bool(strict_max_successive_merges, false, + "Whether to issue filesystem reads to keep within " + "`max_successive_merges` limit"); + static bool ValidatePrefixSize(const char* flagname, int32_t value) { if (value < 0 || value >= 2000000000) { fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", @@ -1783,12 +1810,13 @@ static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed; static enum DistributionType StringToDistributionType(const char* ctype) { assert(ctype); - if (!strcasecmp(ctype, "fixed")) + if (!strcasecmp(ctype, "fixed")) { return kFixed; - else if (!strcasecmp(ctype, "uniform")) + } else if (!strcasecmp(ctype, "uniform")) { return kUniform; - else if (!strcasecmp(ctype, "normal")) + } else if (!strcasecmp(ctype, "normal")) { return kNormal; + } fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype); exit(1); @@ -1798,7 +1826,7 @@ class BaseDistribution { public: BaseDistribution(unsigned int _min, unsigned int _max) : min_value_size_(_min), max_value_size_(_max) {} - virtual ~BaseDistribution() {} + virtual ~BaseDistribution() = default; unsigned int Generate() { auto val = Get(); @@ -1822,8 +1850,8 @@ class FixedDistribution : public BaseDistribution { : BaseDistribution(size, size), size_(size) {} private: - virtual unsigned int Get() override { return size_; } - virtual bool NeedTruncate() override { return false; } + unsigned int Get() override { return size_; } + bool NeedTruncate() override { return false; } unsigned int size_; }; @@ -1839,7 +1867,7 @@ class NormalDistribution : public BaseDistribution, gen_(rd_()) {} private: - virtual unsigned int Get() override { + unsigned int Get() override { return static_cast((*this)(gen_)); } std::random_device rd_; @@ -1855,8 +1883,8 @@ class UniformDistribution : public BaseDistribution, gen_(rd_()) {} private: - virtual unsigned int Get() override { return (*this)(gen_); } - virtual bool NeedTruncate() override { return false; } + unsigned int Get() override { return (*this)(gen_); } + bool NeedTruncate() override { return false; } std::random_device rd_; std::mt19937 gen_; }; @@ -1915,7 +1943,9 @@ class RandomGenerator { }; static void AppendWithSpace(std::string* str, Slice msg) { - if (msg.empty()) return; + if (msg.empty()) { + return; + } if (!str->empty()) { str->push_back(' '); } @@ -2169,7 +2199,9 @@ class Stats { } void Merge(const Stats& other) { - if (other.exclude_from_merge_) return; + if (other.exclude_from_merge_) { + return; + } for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) { auto this_it = hist_.find(it->first); @@ -2183,11 +2215,17 @@ class Stats { done_ += other.done_; bytes_ += other.bytes_; seconds_ += other.seconds_; - if (other.start_ < start_) start_ = other.start_; - if (other.finish_ > finish_) finish_ = other.finish_; + if (other.start_ < start_) { + start_ = other.start_; + } + if (other.finish_ > finish_) { + finish_ = other.finish_; + } // Just keep the messages from one thread. - if (message_.empty()) message_ = other.message_; + if (message_.empty()) { + message_ = other.message_; + } } void Stop() { @@ -2266,20 +2304,21 @@ class Stats { done_ += num_ops; if (done_ >= next_report_ && FLAGS_progress_reports) { if (!FLAGS_stats_interval) { - if (next_report_ < 1000) + if (next_report_ < 1000) { next_report_ += 100; - else if (next_report_ < 5000) + } else if (next_report_ < 5000) { next_report_ += 500; - else if (next_report_ < 10000) + } else if (next_report_ < 10000) { next_report_ += 1000; - else if (next_report_ < 50000) + } else if (next_report_ < 50000) { next_report_ += 5000; - else if (next_report_ < 100000) + } else if (next_report_ < 100000) { next_report_ += 10000; - else if (next_report_ < 500000) + } else if (next_report_ < 500000) { next_report_ += 50000; - else + } else { next_report_ += 100000; + } fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, ""); } else { uint64_t now = clock_->NowMicros(); @@ -2311,8 +2350,9 @@ class Stats { if (db_with_cfh && db_with_cfh->num_created.load()) { for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) { if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", - &stats)) + &stats)) { fprintf(stderr, "%s\n", stats.c_str()); + } if (FLAGS_show_table_properties) { for (int level = 0; level < FLAGS_num_levels; ++level) { if (db->GetProperty( @@ -2370,7 +2410,9 @@ class Stats { void Report(const Slice& name) { // Pretend at least one op was done in case we are running a benchmark // that does not call FinishedOps(). - if (done_ < 1) done_ = 1; + if (done_ < 1) { + done_ = 1; + } std::string extra; double elapsed = (finish_ - start_) * 1e-6; @@ -2635,7 +2677,9 @@ class Duration { int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; } bool Done(int64_t increment) { - if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops + if (increment <= 0) { + increment = 1; // avoid Done(0) and infinite loops + } ops_ += increment; if (max_seconds_) { @@ -2702,7 +2746,7 @@ class Benchmark { no_auto_recovery_(false), recovery_complete_(false) {} - ~ErrorHandlerListener() override {} + ~ErrorHandlerListener() override = default; const char* Name() const override { return kClassName(); } static const char* kClassName() { return "ErrorHandlerListener"; } @@ -3022,6 +3066,7 @@ class Benchmark { static std::shared_ptr NewCache(int64_t capacity) { CompressedSecondaryCacheOptions secondary_cache_opts; + TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyAuto; bool use_tiered_cache = false; if (capacity <= 0) { return nullptr; @@ -3036,12 +3081,34 @@ class Benchmark { FLAGS_compressed_secondary_cache_low_pri_pool_ratio; secondary_cache_opts.compression_type = FLAGS_compressed_secondary_cache_compression_type_e; + secondary_cache_opts.compression_opts.level = + FLAGS_compressed_secondary_cache_compression_level; secondary_cache_opts.compress_format_version = FLAGS_compressed_secondary_cache_compress_format_version; - if (FLAGS_use_tiered_volatile_cache) { + if (FLAGS_use_tiered_cache) { use_tiered_cache = true; + adm_policy = StringToAdmissionPolicy(FLAGS_tiered_adm_policy.c_str()); } } + if (!FLAGS_secondary_cache_uri.empty()) { + if (!use_tiered_cache && FLAGS_use_compressed_secondary_cache) { + fprintf( + stderr, + "Cannot specify both --secondary_cache_uri and " + "--use_compressed_secondary_cache when using a non-tiered cache\n"); + exit(1); + } + Status s = SecondaryCache::CreateFromString( + ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf(stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + } + + std::shared_ptr block_cache; if (FLAGS_cache_type == "clock_cache") { fprintf(stderr, "Old clock cache implementation has been removed.\n"); exit(1); @@ -3061,13 +3128,24 @@ class Benchmark { opts.hash_seed = GetCacheHashSeed(); if (use_tiered_cache) { TieredCacheOptions tiered_opts; - opts.capacity += secondary_cache_opts.capacity; tiered_opts.cache_type = PrimaryCacheType::kCacheTypeHCC; tiered_opts.cache_opts = &opts; + tiered_opts.total_capacity = + opts.capacity + secondary_cache_opts.capacity; + tiered_opts.compressed_secondary_ratio = + secondary_cache_opts.capacity * 1.0 / tiered_opts.total_capacity; tiered_opts.comp_cache_opts = secondary_cache_opts; - return NewTieredCache(tiered_opts); + tiered_opts.nvm_sec_cache = secondary_cache; + tiered_opts.adm_policy = adm_policy; + block_cache = NewTieredCache(tiered_opts); } else { - return opts.MakeSharedCache(); + if (!FLAGS_secondary_cache_uri.empty()) { + opts.secondary_cache = secondary_cache; + } else if (FLAGS_use_compressed_secondary_cache) { + opts.secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + } + block_cache = opts.MakeSharedCache(); } } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts( @@ -3076,36 +3154,37 @@ class Benchmark { GetCacheAllocator(), kDefaultToAdaptiveMutex, kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio); opts.hash_seed = GetCacheHashSeed(); - if (!FLAGS_secondary_cache_uri.empty()) { - Status s = SecondaryCache::CreateFromString( - ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); - if (secondary_cache == nullptr) { - fprintf( - stderr, - "No secondary cache registered matching string: %s status=%s\n", - FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); - exit(1); - } - opts.secondary_cache = secondary_cache; - } else if (FLAGS_use_compressed_secondary_cache && !use_tiered_cache) { - opts.secondary_cache = - NewCompressedSecondaryCache(secondary_cache_opts); - } - if (use_tiered_cache) { TieredCacheOptions tiered_opts; - opts.capacity += secondary_cache_opts.capacity; tiered_opts.cache_type = PrimaryCacheType::kCacheTypeLRU; tiered_opts.cache_opts = &opts; + tiered_opts.total_capacity = + opts.capacity + secondary_cache_opts.capacity; + tiered_opts.compressed_secondary_ratio = + secondary_cache_opts.capacity * 1.0 / tiered_opts.total_capacity; tiered_opts.comp_cache_opts = secondary_cache_opts; - return NewTieredCache(tiered_opts); + tiered_opts.nvm_sec_cache = secondary_cache; + tiered_opts.adm_policy = adm_policy; + block_cache = NewTieredCache(tiered_opts); } else { - return opts.MakeSharedCache(); + if (!FLAGS_secondary_cache_uri.empty()) { + opts.secondary_cache = secondary_cache; + } else if (FLAGS_use_compressed_secondary_cache) { + opts.secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + } + block_cache = opts.MakeSharedCache(); } } else { fprintf(stderr, "Cache type not supported."); exit(1); } + + if (!block_cache) { + fprintf(stderr, "Unable to allocate block cache\n"); + exit(1); + } + return block_cache; } public: @@ -3836,7 +3915,7 @@ class Benchmark { }; static void ThreadBody(void* v) { - ThreadArg* arg = reinterpret_cast(v); + ThreadArg* arg = static_cast(v); SharedState* shared = arg->shared; ThreadState* thread = arg->thread; { @@ -4000,7 +4079,9 @@ class Benchmark { count++; thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers); } - if (ptr == nullptr) exit(1); // Disable unused variable warning. + if (ptr == nullptr) { + exit(1); // Disable unused variable warning. + } } void Compress(ThreadState* thread) { @@ -4489,7 +4570,7 @@ class Benchmark { FLAGS_level0_slowdown_writes_trigger; options.compression = FLAGS_compression_type_e; if (FLAGS_simulate_hybrid_fs_file != "") { - options.bottommost_temperature = Temperature::kWarm; + options.last_level_temperature = Temperature::kWarm; } options.preclude_last_level_data_seconds = FLAGS_preclude_last_level_data_seconds; @@ -4533,13 +4614,10 @@ class Benchmark { options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits; options.paranoid_checks = FLAGS_paranoid_checks; options.force_consistency_checks = FLAGS_force_consistency_checks; - options.check_flush_compaction_key_order = - FLAGS_check_flush_compaction_key_order; options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds; options.ttl = FLAGS_ttl_seconds; // fill storage options options.advise_random_on_open = FLAGS_advise_random_on_open; - options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; options.bytes_per_sync = FLAGS_bytes_per_sync; options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync; @@ -4555,6 +4633,7 @@ class Benchmark { } } options.max_successive_merges = FLAGS_max_successive_merges; + options.strict_max_successive_merges = FLAGS_strict_max_successive_merges; options.report_bg_io_stats = FLAGS_report_bg_io_stats; // set universal style compaction configurations, if applicable @@ -4707,7 +4786,8 @@ class Benchmark { // Get()/MultiGet() FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly : RateLimiter::Mode::kWritesOnly, - FLAGS_rate_limiter_auto_tuned)); + FLAGS_rate_limiter_auto_tuned, + FLAGS_rate_limiter_single_burst_bytes)); } } @@ -4780,8 +4860,8 @@ class Benchmark { } std::vector column_families; for (size_t i = 0; i < num_hot; i++) { - column_families.push_back(ColumnFamilyDescriptor( - ColumnFamilyName(i), ColumnFamilyOptions(options))); + column_families.emplace_back(ColumnFamilyName(i), + ColumnFamilyOptions(options)); } std::vector cfh_idx_to_prob; if (!FLAGS_column_family_distribution.empty()) { @@ -5604,7 +5684,7 @@ class Benchmark { auto total_size = meta.levels[0].size; if (total_size >= db->GetOptions().compaction_options_fifo.max_table_files_size) { - for (auto file_meta : meta.levels[0].files) { + for (const auto& file_meta : meta.levels[0].files) { file_names.emplace_back(file_meta.name); } break; @@ -5655,7 +5735,7 @@ class Benchmark { SequenceNumber sorted_run_largest_seqno = 0; std::string sorted_run_smallest_key, sorted_run_largest_key; bool first_key = true; - for (auto fileMeta : sorted_runs[k][i]) { + for (const auto& fileMeta : sorted_runs[k][i]) { sorted_run_smallest_seqno = std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno); sorted_run_largest_seqno = @@ -5676,7 +5756,7 @@ class Benchmark { (compaction_style == kCompactionStyleUniversal && level > 0)) { SequenceNumber level_smallest_seqno = kMaxSequenceNumber; SequenceNumber level_largest_seqno = 0; - for (auto fileMeta : meta.levels[level].files) { + for (const auto& fileMeta : meta.levels[level].files) { level_smallest_seqno = std::min(level_smallest_seqno, fileMeta.smallest_seqno); level_largest_seqno = @@ -6198,8 +6278,8 @@ class Benchmark { GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]); GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]); } - db->GetApproximateSizes(&ranges[0], static_cast(entries_per_batch_), - &sizes[0]); + db->GetApproximateSizes( + ranges.data(), static_cast(entries_per_batch_), sizes.data()); num_sizes += entries_per_batch_; for (int64_t size : sizes) { size_sum += size; @@ -6252,8 +6332,8 @@ class Benchmark { std::vector ratio_; int range_; - QueryDecider() {} - ~QueryDecider() {} + QueryDecider() = default; + ~QueryDecider() = default; Status Initiate(std::vector ratio_input) { int range_max = 1000; @@ -7596,7 +7676,9 @@ class Benchmark { thread->stats.FinishedOps(nullptr, db, 1, kMerge); } else { Status s = db->Get(read_options_, key, &value); - if (value.length() > max_length) max_length = value.length(); + if (value.length() > max_length) { + max_length = value.length(); + } if (!s.ok() && !s.IsNotFound()) { fprintf(stderr, "get error: %s\n", s.ToString().c_str()); @@ -7661,10 +7743,16 @@ class Benchmark { } bool binary_search(std::vector& data, int start, int end, int key) { - if (data.empty()) return false; - if (start > end) return false; + if (data.empty()) { + return false; + } + if (start > end) { + return false; + } int mid = start + (end - start) / 2; - if (mid > static_cast(data.size()) - 1) return false; + if (mid > static_cast(data.size()) - 1) { + return false; + } if (data[mid] == key) { return true; } else if (data[mid] > key) { @@ -7737,7 +7825,9 @@ class Benchmark { found = binary_search(data, 0, static_cast(data.size() - 1), lookup_key); data.clear(); - if (found) break; + if (found) { + break; + } } std::cout << "Found key? " << std::to_string(found) << "\n"; sp = FLAGS_env->NowNanos(); @@ -7747,7 +7837,9 @@ class Benchmark { std::cout << "Sample data from GetMergeOperands API call: "; for (PinnableSlice& psl : a_slice) { std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n"; - if (to_print++ > 2) break; + if (to_print++ > 2) { + break; + } } } @@ -7829,7 +7921,7 @@ class Benchmark { if (FLAGS_optimistic_transaction_db) { success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db); } else if (FLAGS_transaction_db) { - TransactionDB* txn_db = reinterpret_cast(db_.db); + TransactionDB* txn_db = static_cast(db_.db); success = inserter.TransactionDBInsert(txn_db, txn_options); } else { success = inserter.DBInsert(db_.db); @@ -8161,7 +8253,9 @@ class Benchmark { real_from_level = std::numeric_limits::max(); for (auto& f : files) { - if (f.level > 0 && f.level < real_from_level) real_from_level = f.level; + if (f.level > 0 && f.level < real_from_level) { + real_from_level = f.level; + } } if (real_from_level == std::numeric_limits::max()) { @@ -8177,10 +8271,11 @@ class Benchmark { std::vector files_to_compact; for (auto& f : files) { - if (f.level == real_from_level) + if (f.level == real_from_level) { files_to_compact.push_back(f.name); - else if (f.level > real_from_level && f.level < next_level) + } else if (f.level > real_from_level && f.level < next_level) { next_level = f.level; + } } if (files_to_compact.empty()) { @@ -8221,10 +8316,14 @@ class Benchmark { void CompactLevel(int from_level) { if (db_.db != nullptr) { - while (!CompactLevelHelper(db_, from_level)) WaitForCompaction(); + while (!CompactLevelHelper(db_, from_level)) { + WaitForCompaction(); + } } for (auto& db_with_cfh : multi_dbs_) { - while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction(); + while (!CompactLevelHelper(db_with_cfh, from_level)) { + WaitForCompaction(); + } } } @@ -8558,20 +8657,6 @@ int db_bench_tool(int argc, char** argv) { exit(1); } - if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE")) - FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE; - else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL")) - FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL; - else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL")) - FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL; - else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED")) - FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED; - else { - fprintf(stdout, "Unknown compaction fadvice:%s\n", - FLAGS_compaction_fadvice.c_str()); - exit(1); - } - FLAGS_value_size_distribution_type_e = StringToDistributionType(FLAGS_value_size_distribution_type.c_str()); diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index a30c650654f..1668dfb8836 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -130,7 +130,7 @@ namespace {} // namespace TEST_F(DBBenchTest, OptionsFile) { const std::string kOptionsFileName = test_path_ + "/OPTIONS_test"; Options opt = GetDefaultOptions(); - ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), DBOptions(opt), {"default"}, {ColumnFamilyOptions(opt)}, kOptionsFileName, opt.env->GetFileSystem().get())); @@ -149,7 +149,7 @@ TEST_F(DBBenchTest, OptionsFileUniversal) { Options opt = GetDefaultOptions(kCompactionStyleUniversal, 1); - ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), DBOptions(opt), {"default"}, {ColumnFamilyOptions(opt)}, kOptionsFileName, opt.env->GetFileSystem().get())); @@ -166,7 +166,7 @@ TEST_F(DBBenchTest, OptionsFileMultiLevelUniversal) { Options opt = GetDefaultOptions(kCompactionStyleUniversal, 12); - ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), DBOptions(opt), {"default"}, {ColumnFamilyOptions(opt)}, kOptionsFileName, opt.env->GetFileSystem().get())); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 01c3ae329e0..3fbd5729c2c 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -31,10 +31,10 @@ default_params = { - "acquire_snapshot_one_in": 10000, + "acquire_snapshot_one_in": lambda: random.choice([100, 10000]), "backup_max_size": 100 * 1024 * 1024, # Consider larger number when backups considered more stable - "backup_one_in": 100000, + "backup_one_in": lambda: random.choice([1000, 100000]), "batch_protection_bytes_per_key": lambda: random.choice([0, 8]), "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), "block_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), @@ -48,7 +48,7 @@ "charge_filter_construction": lambda: random.choice([0, 1]), "charge_table_reader": lambda: random.choice([0, 1]), "charge_file_metadata": lambda: random.choice([0, 1]), - "checkpoint_one_in": 1000000, + "checkpoint_one_in": lambda: random.choice([10000, 1000000]), "compression_type": lambda: random.choice( ["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"] ), @@ -67,8 +67,8 @@ "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1), "compression_checksum": lambda: random.randint(0, 1), "clear_column_family_one_in": 0, - "compact_files_one_in": 1000000, - "compact_range_one_in": 1000000, + "compact_files_one_in": lambda: random.choice([1000, 1000000]), + "compact_range_one_in": lambda: random.choice([1000, 1000000]), "compaction_pri": random.randint(0, 4), "data_block_index_type": lambda: random.choice([0, 1]), "delpercent": 4, @@ -78,23 +78,25 @@ "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]), "expected_values_dir": lambda: setup_expected_values_dir(), "fail_if_options_file_error": lambda: random.randint(0, 1), - "flush_one_in": 1000000, - "manual_wal_flush_one_in": lambda: random.choice([0, 0, 1000, 1000000]), + "flush_one_in": lambda: random.choice([1000, 1000000]), + "manual_wal_flush_one_in": lambda: random.choice([0, 1000]), "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]), - "get_live_files_one_in": 1000000, + "get_live_files_one_in": lambda: random.choice([10000, 1000000]), # Note: the following two are intentionally disabled as the corresponding # APIs are not guaranteed to succeed. "get_sorted_wal_files_one_in": 0, "get_current_wal_file_one_in": 0, # Temporarily disable hash index "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]), - "ingest_external_file_one_in": 1000000, + "ingest_external_file_one_in": lambda: random.choice([1000, 1000000]), "iterpercent": 10, - "lock_wal_one_in": 1000000, + "lock_wal_one_in": lambda: random.choice([10000, 1000000]), "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1), "max_background_compactions": 20, "max_bytes_for_level_base": 10485760, - "max_key": 25000000, + # max_key has to be the same across invocations for verification to work, hence no lambda + "max_key": random.choice([100000, 25000000]), + "max_sequential_skip_in_iterations": lambda: random.choice([1, 2, 8, 16]), "max_write_buffer_number": 3, "mmap_read": lambda: random.randint(0, 1), # Setting `nooverwritepercent > 0` is only possible because we do not vary @@ -105,7 +107,7 @@ "optimize_filters_for_memory": lambda: random.randint(0, 1), "partition_filters": lambda: random.randint(0, 1), "partition_pinning": lambda: random.randint(0, 3), - "pause_background_one_in": 1000000, + "pause_background_one_in": lambda: random.choice([10000, 1000000]), "prefix_size": lambda: random.choice([-1, 1, 5, 7, 8]), "prefixpercent": 5, "progress_reports": 0, @@ -116,7 +118,7 @@ "sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]), "long_running_snapshots": lambda: random.randint(0, 1), "subcompactions": lambda: random.randint(1, 4), - "target_file_size_base": 2097152, + "target_file_size_base": lambda: random.choice([512 * 1024, 2048 * 1024]), "target_file_size_multiplier": 2, "test_batches_snapshots": random.randint(0, 1), "top_level_index_pinning": lambda: random.randint(0, 3), @@ -139,7 +141,7 @@ "value_size_mult": 32, "verification_only": 0, "verify_checksum": 1, - "write_buffer_size": 4 * 1024 * 1024, + "write_buffer_size": lambda: random.choice([1024 * 1024, 4 * 1024 * 1024]), "writepercent": 35, "format_version": lambda: random.choice([2, 3, 4, 5, 6, 6]), "index_block_restart_interval": lambda: random.choice(range(1, 16)), @@ -175,9 +177,9 @@ [16, 64, 1024 * 1024, 16 * 1024 * 1024] ), "level_compaction_dynamic_level_bytes": lambda: random.randint(0, 1), - "verify_checksum_one_in": 1000000, - "verify_file_checksums_one_in": 1000000, - "verify_db_one_in": 100000, + "verify_checksum_one_in": lambda: random.choice([100000, 1000000]), + "verify_file_checksums_one_in": lambda: random.choice([100000, 1000000]), + "verify_db_one_in": lambda: random.choice([10000, 100000]), "continuous_verification_interval": 0, "max_key_len": 3, "key_len_percent_dist": "1,30,69", @@ -187,7 +189,7 @@ "open_write_fault_one_in": lambda: random.choice([0, 0, 16]), "open_read_fault_one_in": lambda: random.choice([0, 0, 32]), "sync_fault_injection": lambda: random.randint(0, 1), - "get_property_one_in": 1000000, + "get_property_one_in": lambda: random.choice([100000, 1000000]), "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]), "max_write_buffer_size_to_maintain": lambda: random.choice( [0, 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024] @@ -226,8 +228,47 @@ ), "auto_readahead_size" : lambda: random.choice([0, 1]), "verify_iterator_with_expected_state_one_in": 5, + "allow_fallocate": lambda: random.choice([0, 1]), + "table_cache_numshardbits": lambda: random.choice([6] * 3 + [-1] * 2 + [0]), + "enable_write_thread_adaptive_yield": lambda: random.choice([0, 1]), + "log_readahead_size": lambda: random.choice([0, 16 * 1024 * 1024]), + "bgerror_resume_retry_interval": lambda: random.choice([10000, 1000000]), + "delete_obsolete_files_period_micros": lambda: random.choice([6 * 60 * 60 * 1000000, 30 * 1000000]), + "max_log_file_size": lambda: random.choice([0, 1024 * 1024]), + "log_file_time_to_roll": lambda: random.choice([0, 60]), + "use_adaptive_mutex": lambda: random.choice([0, 1]), + "advise_random_on_open": lambda: random.choice([0] + [1] * 3), + "WAL_ttl_seconds": lambda: random.choice([0, 60]), + "WAL_size_limit_MB": lambda: random.choice([0, 1]), + "wal_bytes_per_sync": lambda: random.choice([0, 1024 * 1024]), + "strict_bytes_per_sync": lambda: random.choice([0, 1]), + "avoid_flush_during_shutdown": lambda: random.choice([0, 1]), + "fill_cache": lambda: random.choice([0, 1]), + "optimize_multiget_for_io": lambda: random.choice([0, 1]), + "memtable_insert_hint_per_batch": lambda: random.choice([0, 1]), + "dump_malloc_stats": lambda: random.choice([0, 1]), + "stats_history_buffer_size": lambda: random.choice([0, 1024 * 1024]), + "skip_stats_update_on_db_open": lambda: random.choice([0, 1]), + "optimize_filters_for_hits": lambda: random.choice([0, 1]), + "sample_for_compression": lambda: random.choice([0, 5]), + "report_bg_io_stats": lambda: random.choice([0, 1]), + "cache_index_and_filter_blocks_with_high_priority": lambda: random.choice([0, 1]), + "use_delta_encoding": lambda: random.choice([0, 1]), + "verify_compression": lambda: random.choice([0, 1]), + "read_amp_bytes_per_bit": lambda: random.choice([0, 32]), + "enable_index_compression": lambda: random.choice([0, 1]), + "index_shortening": lambda: random.choice([0, 1, 2]), + "metadata_charge_policy": lambda: random.choice([0, 1]), + "use_adaptive_mutex_lru": lambda: random.choice([0, 1]), + "compress_format_version": lambda: random.choice([1, 2]), + "manifest_preallocation_size": lambda: random.choice([0, 5 * 1024]), + "enable_checksum_handoff": 0, + "max_total_wal_size": lambda: random.choice([0] * 4 + [64 * 1024 * 1024]), + "high_pri_pool_ratio": lambda: random.choice([0, 0.5]), + "low_pri_pool_ratio": lambda: random.choice([0, 0.5]), + "soft_pending_compaction_bytes_limit" : lambda: random.choice([1024 * 1024] + [64 * 1073741824] * 4), + "hard_pending_compaction_bytes_limit" : lambda: random.choice([2 * 1024 * 1024] + [256 * 1073741824] * 4), } - _TEST_DIR_ENV_VAR = "TEST_TMPDIR" # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR _TEST_EXPECTED_DIR_ENV_VAR = "TEST_TMPDIR_EXPECTED" @@ -424,7 +465,6 @@ def is_direct_io_supported(dbname): best_efforts_recovery_params = { "best_efforts_recovery": 1, - "atomic_flush": 0, "disable_wal": 1, "column_families": 1, "skip_verifydb": 1, @@ -458,6 +498,8 @@ def is_direct_io_supported(dbname): "test_cf_consistency": 0, "test_batches_snapshots": 0, "user_timestamp_size": 8, + # Below flag is randomly picked once and kept consistent in following runs. + "persist_user_defined_timestamps": random.choice([0, 1, 1]), "use_merge": 0, "use_full_merge_v1": 0, "use_txn": 0, @@ -549,9 +591,11 @@ def is_direct_io_supported(dbname): "use_only_the_last_commit_time_batch_for_recovery": 1, "clear_wp_commit_cache_one_in": 10, "create_timestamped_snapshot_one_in": 0, + # sequence number can be advanced in SwitchMemtable::WriteRecoverableState() for WP. + # disable it for now until we find another way to test LockWAL(). + "lock_wal_one_in": 0, } - def finalize_and_sanitize(src_params): dest_params = {k: v() if callable(v) else v for (k, v) in src_params.items()} if is_release_mode(): @@ -566,11 +610,6 @@ def finalize_and_sanitize(src_params): if dest_params["mmap_read"] == 1: dest_params["use_direct_io_for_flush_and_compaction"] = 0 dest_params["use_direct_reads"] = 0 - if dest_params["file_checksum_impl"] != "none": - # TODO(T109283569): there is a bug in `GenerateOneFileChecksum()`, - # used by `IngestExternalFile()`, causing it to fail with mmap - # reads. Remove this once it is fixed. - dest_params["ingest_external_file_one_in"] = 0 if ( dest_params["use_direct_io_for_flush_and_compaction"] == 1 or dest_params["use_direct_reads"] == 1 @@ -668,7 +707,6 @@ def finalize_and_sanitize(src_params): dest_params["enable_pipelined_write"] = 0 if dest_params.get("best_efforts_recovery") == 1: dest_params["disable_wal"] = 1 - dest_params["atomic_flush"] = 0 dest_params["enable_compaction_filter"] = 0 dest_params["sync"] = 0 dest_params["write_fault_one_in"] = 0 @@ -712,6 +750,31 @@ def finalize_and_sanitize(src_params): if (dest_params["cache_size"] <= 0 or dest_params["db_write_buffer_size"] <= 0): dest_params["use_write_buffer_manager"] = 0 + if dest_params["user_timestamp_size"] > 0 and dest_params["persist_user_defined_timestamps"] == 0: + # Features that are not compatible with UDT in memtable only feature. + dest_params["enable_blob_files"] = 0 + dest_params["allow_setting_blob_options_dynamically"] = 0 + dest_params["atomic_flush"] = 0 + dest_params["allow_concurrent_memtable_write"] = 0 + dest_params["block_protection_bytes_per_key"] = 0 + # TODO(yuzhangyu): make stress test logic handle this and enable testing + # these APIs. + # These operations need to compare side to side one operation with another. + # It's hard to guarantee their consistency because when timestamps can be + # collapsed, only operations using the same SuperVersion can be consistent + # with each other. There is no external APIs to ensure that. + dest_params["use_multiget"] = 0 + dest_params["use_multi_get_entity"] = 0 + dest_params["readpercent"] += dest_params.get("iterpercent", 10); + dest_params["iterpercent"] = 0 + # Only best efforts recovery test support disabling wal and + # disable atomic flush. + if dest_params["test_best_efforts_recovery"] == 0: + dest_params["disable_wal"] = 0 + if dest_params.get("disable_wal") == 1: + # disableWAL and recycle_log_file_num options are not mutually + # compatible at the moment + dest_params["recycle_log_file_num"] = 0 return dest_params @@ -789,6 +852,7 @@ def gen_cmd(params, unknown_params): "test_tiered_storage", "cleanup_cmd", "skip_tmpdir_check", + "print_stderr_separately" } and v is not None ] @@ -814,6 +878,27 @@ def execute_cmd(cmd, timeout=None): return hit_timeout, child.returncode, outs.decode("utf-8"), errs.decode("utf-8") +def print_output_and_exit_on_error(stdout, stderr, print_stderr_separately=False): + print("stdout:\n", stdout) + if len(stderr) == 0: + return + + if print_stderr_separately: + print("stderr:\n", stderr, file=sys.stderr) + else: + print("stderr:\n", stderr) + + sys.exit(2) + +def cleanup_after_success(dbname): + shutil.rmtree(dbname, True) + if cleanup_cmd is not None: + print("Running DB cleanup command - %s\n" % cleanup_cmd) + ret = os.system(cleanup_cmd) + if ret != 0: + print("TEST FAILED. DB cleanup returned error %d\n" % ret) + sys.exit(1) + # This script runs and kills db_stress multiple times. It checks consistency # in case of unsafe crashes in RocksDB. def blackbox_crash_main(args, unknown_args): @@ -840,27 +925,10 @@ def blackbox_crash_main(args, unknown_args): if not hit_timeout: print("Exit Before Killing") - print("stdout:") - print(outs) - print("stderr:") - print(errs) + print_output_and_exit_on_error(outs, errs, args.print_stderr_separately) sys.exit(2) - for line in errs.split("\n"): - if line != "" and not line.startswith("WARNING"): - print("stderr has error message:") - print("***" + line + "***") - - stderrdata = errs.lower() - errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times") - print("#times error occurred in output is " + str(errorcount) + "\n") - - if errorcount > 0: - print("TEST FAILED. Output has 'error'!!!\n") - sys.exit(2) - if stderrdata.find("fail") >= 0: - print("TEST FAILED. Output has 'fail'!!!\n") - sys.exit(2) + print_output_and_exit_on_error(outs, errs, args.print_stderr_separately) time.sleep(1) # time to stabilize before the next run @@ -877,27 +945,11 @@ def blackbox_crash_main(args, unknown_args): ) hit_timeout, retcode, outs, errs = execute_cmd(cmd) - # Print stats of the final run - print("stdout:", outs) - - for line in errs.split("\n"): - if line != "" and not line.startswith("WARNING"): - print("stderr has error message:") - print("***" + line + "***") - - stderrdata = errs.lower() - errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times") - print("#times error occurred in output is " + str(errorcount) + "\n") - - if errorcount > 0: - print("TEST FAILED. Output has 'error'!!!\n") - sys.exit(2) - if stderrdata.find("fail") >= 0: - print("TEST FAILED. Output has 'fail'!!!\n") - sys.exit(2) + # For the final run + print_output_and_exit_on_error(outs, errs, args.print_stderr_separately) # we need to clean up after ourselves -- only do this on test success - shutil.rmtree(dbname, True) + cleanup_after_success(dbname) # This python script runs db_stress multiple times. Some runs with @@ -922,6 +974,8 @@ def whitebox_crash_main(args, unknown_args): kill_random_test = cmd_params["random_kill_odd"] kill_mode = 0 prev_compaction_style = -1 + succeeded = True + hit_timeout = False while time.time() < exit_time: if check_mode == 0: additional_opts = { @@ -1036,49 +1090,29 @@ def whitebox_crash_main(args, unknown_args): ) print(msg) - print(stdoutdata) - print(stderrdata) + print_output_and_exit_on_error(stdoutdata, stderrdata, args.print_stderr_separately) if hit_timeout: print("Killing the run for running too long") break - expected = False + succeeded = False if additional_opts["kill_random_test"] is None and (retncode == 0): # we expect zero retncode if no kill option - expected = True + succeeded = True elif additional_opts["kill_random_test"] is not None and retncode <= 0: # When kill option is given, the test MIGHT kill itself. # If it does, negative retncode is expected. Otherwise 0. - expected = True + succeeded = True - if not expected: + if not succeeded: print("TEST FAILED. See kill option and exit code above!!!\n") sys.exit(1) - stderrdata = stderrdata.lower() - errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times") - print("#times error occurred in output is " + str(errorcount) + "\n") - - if errorcount > 0: - print("TEST FAILED. Output has 'error'!!!\n") - sys.exit(2) - if stderrdata.find("fail") >= 0: - print("TEST FAILED. Output has 'fail'!!!\n") - sys.exit(2) - # First half of the duration, keep doing kill test. For the next half, # try different modes. if time.time() > half_time: - # we need to clean up after ourselves -- only do this on test - # success - shutil.rmtree(dbname, True) - if cleanup_cmd is not None: - print("Running DB cleanup command - %s\n" % cleanup_cmd) - ret = os.system(cleanup_cmd) - if ret != 0: - print("TEST FAILED. DB cleanup returned error %d\n" % ret) - sys.exit(1) + cleanup_after_success(dbname) try: os.mkdir(dbname) except OSError: @@ -1092,6 +1126,12 @@ def whitebox_crash_main(args, unknown_args): time.sleep(1) # time to stabilize after a kill + # If successfully finished or timed out (we currently treat timed out test as passing) + # Clean up after ourselves + if succeeded or hit_timeout: + cleanup_after_success(dbname) + + def main(): global stress_cmd global cleanup_cmd @@ -1113,6 +1153,7 @@ def main(): parser.add_argument("--test_tiered_storage", action="store_true") parser.add_argument("--cleanup_cmd") parser.add_argument("--skip_tmpdir_check", action="store_true") + parser.add_argument("--print_stderr_separately", action="store_true", default=False) all_params = dict( list(default_params.items()) diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index c424743d738..bfc78ab1e35 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -54,7 +54,7 @@ struct DataPumpThread { }; static void DataPumpThreadBody(void* arg) { - DataPumpThread* t = reinterpret_cast(arg); + DataPumpThread* t = static_cast(arg); DB* db = t->db; Random rnd(301); uint64_t i = 0; diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index f40be5ae2f2..dd4fd59bc4c 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -88,12 +88,12 @@ class SanityTest { class SanityTestBasic : public SanityTest { public: explicit SanityTestBasic(const std::string& path) : SanityTest(path) {} - virtual Options GetOptions() const override { + Options GetOptions() const override { Options options; options.create_if_missing = true; return options; } - virtual std::string Name() const override { return "Basic"; } + std::string Name() const override { return "Basic"; } }; class SanityTestSpecialComparator : public SanityTest { @@ -103,23 +103,20 @@ class SanityTestSpecialComparator : public SanityTest { options_.comparator = new NewComparator(); } ~SanityTestSpecialComparator() { delete options_.comparator; } - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "SpecialComparator"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "SpecialComparator"; } private: class NewComparator : public Comparator { public: - virtual const char* Name() const override { - return "rocksdb.NewComparator"; - } - virtual int Compare(const Slice& a, const Slice& b) const override { + const char* Name() const override { return "rocksdb.NewComparator"; } + int Compare(const Slice& a, const Slice& b) const override { return BytewiseComparator()->Compare(a, b); } - virtual void FindShortestSeparator(std::string* s, - const Slice& l) const override { + void FindShortestSeparator(std::string* s, const Slice& l) const override { BytewiseComparator()->FindShortestSeparator(s, l); } - virtual void FindShortSuccessor(std::string* key) const override { + void FindShortSuccessor(std::string* key) const override { BytewiseComparator()->FindShortSuccessor(key); } }; @@ -132,8 +129,8 @@ class SanityTestZlibCompression : public SanityTest { : SanityTest(path) { options_.compression = kZlibCompression; } - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "ZlibCompression"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "ZlibCompression"; } private: Options options_; @@ -150,10 +147,8 @@ class SanityTestZlibCompressionVersion2 : public SanityTest { #endif options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); } - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { - return "ZlibCompressionVersion2"; - } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "ZlibCompressionVersion2"; } private: Options options_; @@ -165,8 +160,8 @@ class SanityTestLZ4Compression : public SanityTest { : SanityTest(path) { options_.compression = kLZ4Compression; } - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "LZ4Compression"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "LZ4Compression"; } private: Options options_; @@ -178,8 +173,8 @@ class SanityTestLZ4HCCompression : public SanityTest { : SanityTest(path) { options_.compression = kLZ4HCCompression; } - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "LZ4HCCompression"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "LZ4HCCompression"; } private: Options options_; @@ -191,8 +186,8 @@ class SanityTestZSTDCompression : public SanityTest { : SanityTest(path) { options_.compression = kZSTD; } - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "ZSTDCompression"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "ZSTDCompression"; } private: Options options_; @@ -207,8 +202,8 @@ class SanityTestPlainTableFactory : public SanityTest { options_.allow_mmap_reads = true; } ~SanityTestPlainTableFactory() {} - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "PlainTable"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "PlainTable"; } private: Options options_; @@ -222,8 +217,8 @@ class SanityTestBloomFilter : public SanityTest { options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); } ~SanityTestBloomFilter() {} - virtual Options GetOptions() const override { return options_; } - virtual std::string Name() const override { return "BloomFilter"; } + Options GetOptions() const override { return options_; } + std::string Name() const override { return "BloomFilter"; } private: Options options_; diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc index 92edd5512be..901efc8f888 100644 --- a/tools/dump/db_dump_tool.cc +++ b/tools/dump/db_dump_tool.cc @@ -197,16 +197,20 @@ bool DbUndumpTool::Run(const UndumpOptions& undump_options, std::unique_ptr keyscratch(new char[last_keysize]); std::unique_ptr valscratch(new char[last_valsize]); - while (1) { + while (true) { uint32_t keysize, valsize; ROCKSDB_NAMESPACE::Slice keyslice; ROCKSDB_NAMESPACE::Slice valslice; status = dumpfile->Read(4, &slice, scratch8); - if (!status.ok() || slice.size() != 4) break; + if (!status.ok() || slice.size() != 4) { + break; + } keysize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); if (keysize > last_keysize) { - while (keysize > last_keysize) last_keysize *= 2; + while (keysize > last_keysize) { + last_keysize *= 2; + } keyscratch = std::unique_ptr(new char[last_keysize]); } @@ -227,7 +231,9 @@ bool DbUndumpTool::Run(const UndumpOptions& undump_options, } valsize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); if (valsize > last_valsize) { - while (valsize > last_valsize) last_valsize *= 2; + while (valsize > last_valsize) { + last_valsize *= 2; + } valscratch = std::unique_ptr(new char[last_valsize]); } diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 578d3af2b2c..ce8b15015ed 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -38,7 +38,6 @@ #include "rocksdb/utilities/options_util.h" #include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" -#include "table/scoped_arena_iterator.h" #include "table/sst_file_dumper.h" #include "tools/ldb_cmd_impl.h" #include "util/cast_util.h" @@ -124,7 +123,7 @@ void DumpSstFile(Options options, std::string filename, bool output_hex, void DumpBlobFile(const std::string& filename, bool is_key_hex, bool is_value_hex, bool dump_uncompressed_blobs); -}; // namespace +} // namespace LDBCommand* LDBCommand::InitFromCmdLineArgs( int argc, char const* const* argv, const Options& options, @@ -132,7 +131,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( const std::vector* column_families) { std::vector args; for (int i = 1; i < argc; i++) { - args.push_back(argv[i]); + args.emplace_back(argv[i]); } return InitFromCmdLineArgs(args, options, ldb_options, column_families, SelectCommand); @@ -151,7 +150,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( LDBCommand* LDBCommand::InitFromCmdLineArgs( const std::vector& args, const Options& options, const LDBOptions& ldb_options, - const std::vector* /*column_families*/, + const std::vector* column_families, const std::function& selector) { // --x=y command line arguments are added as x->y map entries in // parsed_params.option_map. @@ -201,6 +200,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( if (command) { command->SetDBOptions(options); command->SetLDBOptions(ldb_options); + command->SetColumnFamilies(column_families); } return command; } @@ -209,6 +209,9 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { if (parsed_params.cmd == GetCommand::Name()) { return new GetCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == MultiGetCommand::Name()) { + return new MultiGetCommand(parsed_params.cmd_params, + parsed_params.option_map, parsed_params.flags); } else if (parsed_params.cmd == GetEntityCommand::Name()) { return new GetEntityCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); @@ -767,6 +770,10 @@ void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) { } } + if (options_.comparator != nullptr) { + cf_opts->comparator = options_.comparator; + } + cf_opts->force_consistency_checks = force_consistency_checks_; if (use_table_options) { cf_opts->table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -931,10 +938,12 @@ void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) { // Second, overrides the options according to the CLI arguments and the // specific subcommand being run. void LDBCommand::PrepareOptions() { + std::vector column_families_from_options; + if (!create_if_missing_ && try_load_options_) { config_options_.env = options_.env; Status s = LoadLatestOptions(config_options_, db_path_, &options_, - &column_families_); + &column_families_from_options); if (!s.ok() && !s.IsNotFound()) { // Option file exists but load option file error. std::string current_version = std::to_string(ROCKSDB_MAJOR) + "." + @@ -960,7 +969,7 @@ void LDBCommand::PrepareOptions() { } // If merge operator is not set, set a string append operator. - for (auto& cf_entry : column_families_) { + for (auto& cf_entry : column_families_from_options) { if (!cf_entry.options.merge_operator) { cf_entry.options.merge_operator = MergeOperators::CreateStringAppendOperator(':'); @@ -978,22 +987,29 @@ void LDBCommand::PrepareOptions() { } if (column_families_.empty()) { - // Reads the MANIFEST to figure out what column families exist. In this - // case, the option overrides from the CLI argument/specific subcommand - // apply to all column families. - std::vector cf_list; - Status st = DB::ListColumnFamilies(options_, db_path_, &cf_list); - // It is possible the DB doesn't exist yet, for "create if not - // existing" case. The failure is ignored here. We rely on DB::Open() - // to give us the correct error message for problem with opening - // existing DB. - if (st.ok() && cf_list.size() > 1) { - // Ignore single column family DB. - for (auto cf_name : cf_list) { - column_families_.emplace_back(cf_name, options_); + // column_families not set. Either set it from MANIFEST or OPTIONS file. + if (column_families_from_options.empty()) { + // Reads the MANIFEST to figure out what column families exist. In this + // case, the option overrides from the CLI argument/specific subcommand + // apply to all column families. + std::vector cf_list; + Status st = DB::ListColumnFamilies(options_, db_path_, &cf_list); + // It is possible the DB doesn't exist yet, for "create if not + // existing" case. The failure is ignored here. We rely on DB::Open() + // to give us the correct error message for problem with opening + // existing DB. + if (st.ok() && cf_list.size() > 1) { + // Ignore single column family DB. + for (const auto& cf_name : cf_list) { + column_families_.emplace_back(cf_name, options_); + } } + } else { + SetColumnFamilies(&column_families_from_options); } - } else { + } + + if (!column_families_from_options.empty()) { // We got column families from the OPTIONS file. In this case, the option // overrides from the CLI argument/specific subcommand only apply to the // column family specified by `--column_family_name`. @@ -1371,7 +1387,7 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", options.daily_offpeak_time_utc, - /*error_handler=*/nullptr); + /*error_handler=*/nullptr, /*read_only=*/true); Status s = versions.DumpManifest(options, file, verbose, hex, json, cf_descs); if (!s.ok()) { fprintf(stderr, "Error in processing file %s %s\n", file.c_str(), @@ -1402,8 +1418,7 @@ ManifestDumpCommand::ManifestDumpCommand( options, flags, false, BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})), verbose_(false), - json_(false), - path_("") { + json_(false) { verbose_ = IsFlagPresent(flags, ARG_VERBOSE); json_ = IsFlagPresent(flags, ARG_JSON); @@ -1516,7 +1531,7 @@ Status GetLiveFilesChecksumInfoFromVersionSet(Options options, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", options.daily_offpeak_time_utc, - /*error_handler=*/nullptr); + /*error_handler=*/nullptr, /*read_only=*/true); std::vector cf_name_list; s = versions.ListColumnFamilies(&cf_name_list, db_path, immutable_db_options.fs.get()); @@ -1549,8 +1564,7 @@ FileChecksumDumpCommand::FileChecksumDumpCommand( const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false, - BuildCmdLineOptions({ARG_PATH, ARG_HEX})), - path_("") { + BuildCmdLineOptions({ARG_PATH, ARG_HEX})) { auto itr = options.find(ARG_PATH); if (itr != options.end()) { path_ = itr->second; @@ -1676,7 +1690,7 @@ void ListColumnFamiliesCommand::DoCommand() { } else { fprintf(stdout, "Column families in %s: \n{", db_path_.c_str()); bool first = true; - for (auto cf : column_families) { + for (const auto& cf : column_families) { if (!first) { fprintf(stdout, ", "); } @@ -1909,11 +1923,16 @@ void InternalDumpCommand::DoCommand() { s1 = 0; row = ikey.Encode().ToString(); val = key_version.value; - for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) s1++; - for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) s1++; + for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) { + s1++; + } + for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) { + s1++; + } for (int j = 0; row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01'; - j++) + j++) { rtype1 += row[j]; + } if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", rtype2.c_str(), c, s2); @@ -1959,7 +1978,9 @@ void InternalDumpCommand::DoCommand() { } // Terminate if maximum number of keys have been dumped - if (max_keys_ > 0 && count >= max_keys_) break; + if (max_keys_ > 0 && count >= max_keys_) { + break; + } } if (count_delim_) { fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", @@ -2198,9 +2219,13 @@ void DBDumperCommand::DoDumpCommand() { for (; iter->Valid(); iter->Next()) { int rawtime = 0; // If end marker was specified, we stop before it - if (!null_to_ && (iter->key().ToString() >= to_)) break; + if (!null_to_ && (iter->key().ToString() >= to_)) { + break; + } // Terminate if maximum number of keys have been dumped - if (max_keys == 0) break; + if (max_keys == 0) { + break; + } if (is_db_ttl_) { TtlIterator* it_ttl = static_cast_with_check(iter); rawtime = it_ttl->ttl_timestamp(); @@ -2221,8 +2246,9 @@ void DBDumperCommand::DoDumpCommand() { row = iter->key().ToString(); val = iter->value().ToString(); s1 = row.size() + val.size(); - for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++) + for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++) { rtype1 += row[j]; + } if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", rtype2.c_str(), c, s2); @@ -2299,7 +2325,7 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand( std::vector ReduceDBLevelsCommand::PrepareArgs( const std::string& db_path, int new_levels, bool print_old_level) { std::vector ret; - ret.push_back("reduce_levels"); + ret.emplace_back("reduce_levels"); ret.push_back("--" + ARG_DB + "=" + db_path); ret.push_back("--" + ARG_NEW_LEVELS + "=" + std::to_string(new_levels)); if (print_old_level) { @@ -2339,7 +2365,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) { /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", opt.daily_offpeak_time_utc, - /*error_handler=*/nullptr); + /*error_handler=*/nullptr, /*read_only=*/true); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); @@ -2668,7 +2694,7 @@ class InMemoryHandler : public WriteBatch::Handler { return Status::OK(); } - ~InMemoryHandler() override {} + ~InMemoryHandler() override = default; protected: Handler::OptionState WriteAfterCommit() const override { @@ -2707,8 +2733,9 @@ void DumpWalFile(Options options, std::string wal_file, bool print_header, // we need the log number, but ParseFilename expects dbname/NNN.log. std::string sanitized = wal_file; size_t lastslash = sanitized.rfind('/'); - if (lastslash != std::string::npos) + if (lastslash != std::string::npos) { sanitized = sanitized.substr(lastslash + 1); + } if (!ParseFileName(sanitized, &log_number, &type)) { // bogus input, carry on as best we can log_number = 0; @@ -2853,6 +2880,8 @@ void GetCommand::DoCommand() { if (st.ok()) { fprintf(stdout, "%s\n", (is_value_hex_ ? StringToHex(value) : value).c_str()); + } else if (st.IsNotFound()) { + fprintf(stdout, "Key not found\n"); } else { std::stringstream oss; oss << "Get failed: " << st.ToString(); @@ -2862,6 +2891,69 @@ void GetCommand::DoCommand() { // ---------------------------------------------------------------------------- +MultiGetCommand::MultiGetCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + if (params.size() < 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "At least one must be specified for multi_get."); + } else { + for (size_t i = 0; i < params.size(); ++i) { + std::string key = params.at(i); + keys_.emplace_back(is_key_hex_ ? HexToString(key) : key); + } + } +} + +void MultiGetCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(MultiGetCommand::Name()); + ret.append(" ..."); + ret.append("\n"); +} + +void MultiGetCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + size_t num_keys = keys_.size(); + std::vector key_slices; + std::vector values(num_keys); + std::vector statuses(num_keys); + for (const std::string& key : keys_) { + key_slices.emplace_back(key); + } + db_->MultiGet(ReadOptions(), GetCfHandle(), num_keys, key_slices.data(), + values.data(), statuses.data()); + + bool failed = false; + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + fprintf(stdout, is_value_hex_ ? "%s%s0x%s\n" : "%s%s%s\n", + (is_key_hex_ ? StringToHex(keys_[i]) : keys_[i]).c_str(), DELIM, + values[i].ToString(is_value_hex_).c_str()); + } else if (statuses[i].IsNotFound()) { + fprintf(stdout, "Key not found: %s\n", + (is_key_hex_ ? StringToHex(keys_[i]) : keys_[i]).c_str()); + } else { + fprintf(stderr, "Status for key %s: %s\n", + (is_key_hex_ ? StringToHex(keys_[i]) : keys_[i]).c_str(), + statuses[i].ToString().c_str()); + failed = false; + } + } + if (failed) { + exec_state_ = + LDBCommandExecuteResult::Failed("one or more keys had non-okay status"); + } +} + +// ---------------------------------------------------------------------------- + GetEntityCommand::GetEntityCommand( const std::vector& params, const std::map& options, @@ -2984,9 +3076,8 @@ BatchPutCommand::BatchPutCommand( for (size_t i = 0; i < params.size(); i += 2) { std::string key = params.at(i); std::string value = params.at(i + 1); - key_values_.push_back(std::pair( - is_key_hex_ ? HexToString(key) : key, - is_value_hex_ ? HexToString(value) : value)); + key_values_.emplace_back(is_key_hex_ ? HexToString(key) : key, + is_value_hex_ ? HexToString(value) : value); } } create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); @@ -4367,8 +4458,10 @@ UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand( } void UnsafeRemoveSstFileCommand::DoCommand() { - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + const WriteOptions write_options; + PrepareOptions(); OfflineManifestWriter w(options_, db_path_); @@ -4393,7 +4486,7 @@ void UnsafeRemoveSstFileCommand::DoCommand() { s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, write_options, cfd, &edit, db_dir.get()); } } diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index d56ccf662e1..1a2a989433d 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -208,9 +208,9 @@ class UpdateManifestCommand : public LDBCommand { const std::vector& flags); static void Help(std::string& ret); - virtual void DoCommand() override; + void DoCommand() override; - virtual bool NoDBOpen() override { return true; } + bool NoDBOpen() override { return true; } private: bool verbose_; @@ -403,6 +403,22 @@ class GetCommand : public LDBCommand { std::string key_; }; +class MultiGetCommand : public LDBCommand { + public: + static std::string Name() { return "multi_get"; } + + MultiGetCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::vector keys_; +}; + class GetEntityCommand : public LDBCommand { public: static std::string Name() { return "get_entity"; } diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index 465d1eb3171..711a313db67 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -14,6 +14,7 @@ #include "file/filename.h" #include "port/stack_trace.h" #include "rocksdb/advanced_options.h" +#include "rocksdb/comparator.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/file_checksum.h" @@ -185,7 +186,7 @@ class FileChecksumTestHelper { public: FileChecksumTestHelper(Options& options, DB* db, std::string db_name) : options_(options), db_(db), dbname_(db_name) {} - ~FileChecksumTestHelper() {} + ~FileChecksumTestHelper() = default; // Verify the checksum information in Manifest. Status VerifyChecksumInManifest( @@ -208,7 +209,8 @@ class FileChecksumTestHelper { ImmutableDBOptions immutable_db_options(options_); VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb, &wc, nullptr, nullptr, "", "", - options_.daily_offpeak_time_utc, nullptr); + options_.daily_offpeak_time_utc, nullptr, + /*read_only=*/false); std::vector cf_name_list; Status s; s = versions.ListColumnFamilies(&cf_name_list, dbname_, @@ -233,8 +235,8 @@ class FileChecksumTestHelper { return Status::Corruption("The number of files does not match!"); } for (size_t i = 0; i < live_files.size(); i++) { - std::string stored_checksum = ""; - std::string stored_func_name = ""; + std::string stored_checksum; + std::string stored_func_name; s = checksum_list->SearchOneFileChecksum( live_files[i].file_number, &stored_checksum, &stored_func_name); if (s.IsNotFound()) { @@ -269,7 +271,7 @@ class FileChecksumTestHelper { break; } } - EXPECT_OK(db_->EnableFileDeletions(/*force=*/false)); + EXPECT_OK(db_->EnableFileDeletions()); return cs; } }; @@ -634,9 +636,9 @@ TEST_F(LdbCmdTest, OptionParsing) { opts.env = TryLoadCustomOrDefaultEnv(); { std::vector args; - args.push_back("scan"); - args.push_back("--ttl"); - args.push_back("--timestamp"); + args.emplace_back("scan"); + args.emplace_back("--ttl"); + args.emplace_back("--timestamp"); LDBCommand* command = ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs( args, opts, LDBOptions(), nullptr); const std::vector flags = command->TEST_GetFlags(); @@ -648,9 +650,9 @@ TEST_F(LdbCmdTest, OptionParsing) { // test parsing options which contains equal sign in the option value { std::vector args; - args.push_back("scan"); - args.push_back("--db=/dev/shm/ldbtest/"); - args.push_back( + args.emplace_back("scan"); + args.emplace_back("--db=/dev/shm/ldbtest/"); + args.emplace_back( "--from='abcd/efg/hijk/lmn/" "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz=a&bcd_" "ef=gh.ijk'"); @@ -701,7 +703,7 @@ TEST_F(LdbCmdTest, ListFileTombstone) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) { - std::string* out_str = reinterpret_cast(arg); + std::string* out_str = static_cast(arg); // Count number of tombstones printed int num_tb = 0; @@ -734,7 +736,7 @@ TEST_F(LdbCmdTest, ListFileTombstone) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) { - std::string* out_str = reinterpret_cast(arg); + std::string* out_str = static_cast(arg); // Count number of tombstones printed int num_tb = 0; @@ -794,7 +796,7 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) { SyncPoint::GetInstance()->SetCallBack( "Version::PrepareAppend:forced_check", [&](void* arg) { - bool* forced = reinterpret_cast(arg); + bool* forced = static_cast(arg); ASSERT_TRUE(*forced); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -814,7 +816,7 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) { SyncPoint::GetInstance()->SetCallBack( "Version::PrepareAppend:forced_check", [&](void* arg) { - bool* forced = reinterpret_cast(arg); + bool* forced = static_cast(arg); ASSERT_TRUE(*forced); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -835,8 +837,7 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) { SyncPoint::GetInstance()->SetCallBack( "ColumnFamilyData::ColumnFamilyData", [&](void* arg) { - ColumnFamilyOptions* cfo = - reinterpret_cast(arg); + ColumnFamilyOptions* cfo = static_cast(arg); ASSERT_FALSE(cfo->force_consistency_checks); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -1070,7 +1071,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) { auto test_fs = std::make_shared(FileSystem::Default()); std::unique_ptr env(new CompositeEnvWrapper(Env::Default(), test_fs)); Options opts; - opts.bottommost_temperature = Temperature::kWarm; + opts.last_level_temperature = Temperature::kWarm; opts.level0_file_num_compaction_trigger = 10; opts.create_if_missing = true; opts.env = env.get(); @@ -1207,6 +1208,51 @@ TEST_F(LdbCmdTest, RenameDbAndLoadOptions) { ASSERT_OK(DestroyDB(new_dbname, opts)); } +class MyComparator : public Comparator { + public: + int Compare(const Slice& a, const Slice& b) const override { + return a.compare(b); + } + void FindShortSuccessor(std::string* /*key*/) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + const char* Name() const override { return "my_comparator"; } +}; + +TEST_F(LdbCmdTest, CustomComparator) { + Env* env = TryLoadCustomOrDefaultEnv(); + MyComparator my_comparator; + Options opts; + opts.env = env; + opts.create_if_missing = true; + opts.create_missing_column_families = true; + opts.comparator = &my_comparator; + + std::string dbname = test::PerThreadDBPath(env, "ldb_cmd_test"); + DB* db = nullptr; + + std::vector cfds = { + {kDefaultColumnFamilyName, opts}, {"cf1", opts}, {"cf2", opts}}; + std::vector handles; + ASSERT_OK(DestroyDB(dbname, opts)); + ASSERT_OK(DB::Open(opts, dbname, cfds, &handles, &db)); + ASSERT_OK(db->Put(WriteOptions(), "k1", "v1")); + + for (auto& h : handles) { + ASSERT_OK(db->DestroyColumnFamilyHandle(h)); + } + delete db; + + char arg1[] = "./ldb"; + std::string arg2 = "--db=" + dbname; + char arg3[] = "get"; + char arg4[] = "k1"; + char* argv[] = {arg1, const_cast(arg2.c_str()), arg3, arg4}; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), &cfds)); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/tools/ldb_test.py b/tools/ldb_test.py index cde0414713d..0d6b125f027 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -119,7 +119,9 @@ def testSimpleStringPutGet(self): self.assertRunOK("put x2 y2", "OK") self.assertRunOK("get x1", "y1") self.assertRunOK("get x2", "y2") - self.assertRunFAIL("get x3") + self.assertRunOK("multi_get x1 x2", "x1 ==> y1\nx2 ==> y2") + self.assertRunOK("get x3", "Key not found") + self.assertRunOK("multi_get x3", "Key not found: x3") self.assertRunFAIL("put_entity x4") self.assertRunFAIL("put_entity x4 cv1") @@ -311,6 +313,8 @@ def testHexPutGet(self): self.assertRunOK("get --hex 0x6131", "0x6231") self.assertRunOK("get a2", "b2") self.assertRunOK("get --hex 0x6132", "0x6232") + self.assertRunOK("multi_get --hex 0x6131 0x6132", "0x6131 ==> 0x6231\n0x6132 ==> 0x6232") + self.assertRunOK("multi_get --hex 0x6131 0xBEEF", "0x6131 ==> 0x6231\nKey not found: 0xBEEF") self.assertRunOK("get --key_hex 0x6132", "b2") self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232") self.assertRunOK("get --value_hex a2", "0x6232") diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index 0d6bc3c5b89..8068ef7b2cd 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -10,7 +10,7 @@ namespace ROCKSDB_NAMESPACE { -LDBOptions::LDBOptions() {} +LDBOptions::LDBOptions() = default; void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, const char* /*exec_name*/, bool to_stderr) { @@ -90,6 +90,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, ret.append("Data Access Commands:\n"); PutCommand::Help(ret); GetCommand::Help(ret); + MultiGetCommand::Help(ret); BatchPutCommand::Help(ret); ScanCommand::Help(ret); DeleteCommand::Help(ret); diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc index 2b9aa0950fe..7d9b9dc6e20 100644 --- a/tools/simulated_hybrid_file_system.cc +++ b/tools/simulated_hybrid_file_system.cc @@ -86,7 +86,9 @@ SimulatedHybridFileSystem::~SimulatedHybridFileSystem() { metadata += f; metadata += "\n"; } - IOStatus s = WriteStringToFile(target(), metadata, metadata_file_name_, true); + IOOptions opts; + IOStatus s = + WriteStringToFile(target(), metadata, metadata_file_name_, true, opts); if (!s.ok()) { fprintf(stderr, "Error writing to file %s: %s", metadata_file_name_.c_str(), s.ToString().c_str()); @@ -240,4 +242,3 @@ IOStatus SimulatedWritableFile::Sync(const IOOptions& options, return target()->Sync(options, dbg); } } // namespace ROCKSDB_NAMESPACE - diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index f0b71bf8ea0..a5c567b38af 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include +#include #include "db/wide/wide_column_serialization.h" #include "file/random_access_file_reader.h" @@ -16,9 +16,11 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/sst_dump_tool.h" #include "table/block_based/block_based_table_factory.h" +#include "table/sst_file_dumper.h" #include "table/table_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/defer.h" namespace ROCKSDB_NAMESPACE { @@ -107,7 +109,7 @@ class SSTDumpToolTest : public testing::Test { } void createSST(const Options& opts, const std::string& file_name, - uint32_t wide_column_one_in = 0) { + uint32_t wide_column_one_in = 0, bool range_del = false) { Env* test_env = opts.env; FileOptions file_options(opts); ReadOptions read_options; @@ -116,17 +118,19 @@ class SSTDumpToolTest : public testing::Test { ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); std::unique_ptr tb; - IntTblPropCollectorFactories int_tbl_prop_collector_factories; + InternalTblPropCollFactories internal_tbl_prop_coll_factories; std::unique_ptr file_writer; ASSERT_OK(WritableFileWriter::Create(test_env->GetFileSystem(), file_name, file_options, &file_writer, nullptr)); std::string column_family_name; int unknown_level = -1; + const WriteOptions write_options; tb.reset(opts.table_factory->NewTableBuilder( TableBuilderOptions( - imoptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, CompressionOptions(), + imoptions, moptions, read_options, write_options, ikc, + &internal_tbl_prop_coll_factories, CompressionType::kNoCompression, + CompressionOptions(), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level), file_writer.get())); @@ -135,7 +139,7 @@ class SSTDumpToolTest : public testing::Test { uint32_t num_keys = kNumKey; const char* comparator_name = ikc.user_comparator()->Name(); if (strcmp(comparator_name, ReverseBytewiseComparator()->Name()) == 0) { - for (int32_t i = num_keys; i >= 0; i--) { + for (int32_t i = num_keys; i > 0; i--) { if (wide_column_one_in == 0 || i % wide_column_one_in != 0) { tb->Add(MakeKey(i), MakeValue(i)); } else { @@ -150,7 +154,12 @@ class SSTDumpToolTest : public testing::Test { tb->Add(MakeKeyWithTimeStamp(i, 100 + i), MakeValue(i)); } } else { - for (uint32_t i = 0; i < num_keys; i++) { + uint32_t i = 0; + if (range_del) { + tb->Add(MakeKey(i, kTypeRangeDeletion), MakeValue(i + 1)); + i = 1; + } + for (; i < num_keys; i++) { if (wide_column_one_in == 0 || i % wide_column_one_in != 0) { tb->Add(MakeKey(i), MakeValue(i)); } else { @@ -160,7 +169,7 @@ class SSTDumpToolTest : public testing::Test { } } ASSERT_OK(tb->Finish()); - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); } protected: @@ -417,9 +426,9 @@ TEST_F(SSTDumpToolTest, ValidSSTPath) { std::string sst_file = MakeFilePath("rocksdb_sst_test.sst"); createSST(opts, sst_file); std::string text_file = MakeFilePath("text_file"); - ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file)); + ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file, false)); std::string fake_sst = MakeFilePath("fake_sst.sst"); - ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst)); + ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst, false)); for (const auto& command_arg : {"--command=verify", "--command=identify"}) { snprintf(usage[1], kOptLength, "%s", command_arg); @@ -483,6 +492,123 @@ TEST_F(SSTDumpToolTest, RawOutput) { } } +TEST_F(SSTDumpToolTest, SstFileDumperMmapReads) { + Options opts; + opts.env = env(); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path, 10); + + EnvOptions env_opts; + uint64_t data_size = 0; + + // Test all combinations of mmap read options + for (int i = 0; i < 4; ++i) { + SaveAndRestore sar_opts(&opts.allow_mmap_reads, (i & 1) != 0); + SaveAndRestore sar_env_opts(&env_opts.use_mmap_reads, (i & 2) != 0); + + SstFileDumper dumper(opts, file_path, Temperature::kUnknown, + 1024 /*readahead_size*/, true /*verify_checksum*/, + false /*output_hex*/, false /*decode_blob_index*/, + env_opts); + ASSERT_OK(dumper.getStatus()); + std::shared_ptr tp; + ASSERT_OK(dumper.ReadTableProperties(&tp)); + ASSERT_NE(tp.get(), nullptr); + if (i == 0) { + // Verify consistency of a populated field with some entropy + data_size = tp->data_size; + ASSERT_GT(data_size, 0); + } else { + ASSERT_EQ(data_size, tp->data_size); + } + } + + cleanup(opts, file_path); +} + +TEST_F(SSTDumpToolTest, SstFileDumperVerifyNumRecords) { + Options opts; + opts.env = env(); + + EnvOptions env_opts; + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + { + createSST(opts, file_path, 10); + SstFileDumper dumper(opts, file_path, Temperature::kUnknown, + 1024 /*readahead_size*/, true /*verify_checksum*/, + false /*output_hex*/, false /*decode_blob_index*/, + env_opts, /*silent=*/true); + ASSERT_OK(dumper.getStatus()); + ASSERT_OK(dumper.ReadSequential( + /*print_kv=*/false, + /*read_num_limit=*/std::numeric_limits::max(), + /*has_from=*/false, /*from_key=*/"", + /*has_to=*/false, /*to_key=*/"")); + cleanup(opts, file_path); + } + + { + // Test with range del + createSST(opts, file_path, 10, /*range_del=*/true); + SstFileDumper dumper(opts, file_path, Temperature::kUnknown, + 1024 /*readahead_size*/, true /*verify_checksum*/, + false /*output_hex*/, false /*decode_blob_index*/, + env_opts, /*silent=*/true); + ASSERT_OK(dumper.getStatus()); + ASSERT_OK(dumper.ReadSequential( + /*print_kv=*/false, + /*read_num_limit=*/std::numeric_limits::max(), + /*has_from=*/false, /*from_key=*/"", + /*has_to=*/false, /*to_key=*/"")); + cleanup(opts, file_path); + } + + { + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + props->num_entries = kNumKey + 2; + }); + SyncPoint::GetInstance()->EnableProcessing(); + createSST(opts, file_path, 10); + SstFileDumper dumper(opts, file_path, Temperature::kUnknown, + 1024 /*readahead_size*/, true /*verify_checksum*/, + false /*output_hex*/, false /*decode_blob_index*/, + env_opts, /*silent=*/true); + ASSERT_OK(dumper.getStatus()); + Status s = dumper.ReadSequential( + /*print_kv=*/false, + /*read_num_limit==*/std::numeric_limits::max(), + /*has_from=*/false, /*from_key=*/"", + /*has_to=*/false, /*to_key=*/""); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr("Table property expects 1026 entries when excluding range " + "deletions, but scanning the table returned 1024 entries", + s.getState())); + + // Validation is not performed when read_num, has_from, has_to are set + ASSERT_OK(dumper.ReadSequential( + /*print_kv=*/false, /*read_num_limit=*/10, + /*has_from=*/false, /*from_key=*/"", + /*has_to=*/false, /*to_key=*/"")); + + ASSERT_OK(dumper.ReadSequential( + /*print_kv=*/false, + /*read_num_limit=*/std::numeric_limits::max(), + /*has_from=*/true, /*from_key=*/MakeKey(100), + /*has_to=*/false, /*to_key=*/"")); + + ASSERT_OK(dumper.ReadSequential( + /*print_kv=*/false, + /*read_num_limit=*/std::numeric_limits::max(), + /*has_from=*/false, /*from_key=*/"", + /*has_to=*/true, /*to_key=*/MakeKey(100))); + + cleanup(opts, file_path); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 1b269043ab2..ede23631339 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -55,6 +55,8 @@ void print_help(bool to_stderr) { --command=check|scan|raw|verify|identify check: Iterate over entries in files but don't print anything except if an error is encountered (default command) + When read_num, from and to are not set, it compares the number of keys read with num_entries in table + property and will report corruption if there is a mismatch. scan: Iterate over entries in files and print them to screen raw: Dump all the table contents to _dump.txt verify: Iterate all the blocks in files verifying checksum to detect possible corruption but don't print anything except if a corruption is encountered @@ -400,7 +402,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { // that whether it is a valid sst or not // (A directory "file" is not a valid sst) filenames.clear(); - filenames.push_back(dir_or_file); + filenames.emplace_back(dir_or_file); dir = false; } @@ -468,7 +470,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); exit(1); } else { - fprintf(stdout, "raw dump written to file %s\n", &out_filename[0]); + fprintf(stdout, "raw dump written to file %s\n", out_filename.data()); } continue; } diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index e7d090eb291..1d5c870540a 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -56,7 +56,7 @@ class TraceAnalyzerTest : public testing::Test { dbname_ = test_path_ + "/db"; } - ~TraceAnalyzerTest() override {} + ~TraceAnalyzerTest() override = default; void GenerateTrace(std::string trace_path) { Options options; @@ -87,11 +87,11 @@ class TraceAnalyzerTest : public testing::Test { ASSERT_OK(batch.DeleteRange("e", "f")); ASSERT_OK(db_->Write(wo, &batch)); std::vector keys; - keys.push_back("a"); - keys.push_back("b"); - keys.push_back("df"); - keys.push_back("gege"); - keys.push_back("hjhjhj"); + keys.emplace_back("a"); + keys.emplace_back("b"); + keys.emplace_back("df"); + keys.emplace_back("gege"); + keys.emplace_back("hjhjhj"); std::vector values; std::vector ss = db_->MultiGet(ro, keys, &values); ASSERT_GE(ss.size(), 0); @@ -176,8 +176,6 @@ class TraceAnalyzerTest : public testing::Test { ASSERT_EQ(result[i][0], cnt[i][0]); } } - - return; } void AnalyzeTrace(std::vector& paras_diff, diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 00a4da04616..c484959c437 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -170,18 +170,26 @@ namespace ROCKSDB_NAMESPACE { const size_t kShadowValueSize = 10; std::map taOptToIndex = { - {"get", 0}, {"put", 1}, - {"delete", 2}, {"single_delete", 3}, - {"range_delete", 4}, {"merge", 5}, - {"iterator_Seek", 6}, {"iterator_SeekForPrev", 7}, - {"multiget", 8}}; + {"get", kGet}, + {"put", kPut}, + {"delete", kDelete}, + {"single_delete", kSingleDelete}, + {"range_delete", kRangeDelete}, + {"merge", kMerge}, + {"iterator_Seek", kIteratorSeek}, + {"iterator_SeekForPrev", kIteratorSeekForPrev}, + {"multiget", kMultiGet}}; std::map taIndexToOpt = { - {0, "get"}, {1, "put"}, - {2, "delete"}, {3, "single_delete"}, - {4, "range_delete"}, {5, "merge"}, - {6, "iterator_Seek"}, {7, "iterator_SeekForPrev"}, - {8, "multiget"}}; + {kGet, "get"}, + {kPut, "put"}, + {kDelete, "delete"}, + {kSingleDelete, "single_delete"}, + {kRangeDelete, "range_delete"}, + {kMerge, "merge"}, + {kIteratorSeek, "iterator_Seek"}, + {kIteratorSeekForPrev, "iterator_SeekForPrev"}, + {kMultiGet, "multiget"}}; namespace { @@ -201,7 +209,7 @@ uint64_t MultiplyCheckOverflow(uint64_t op1, uint64_t op2) { AnalyzerOptions::AnalyzerOptions() : correlation_map(kTaTypeNum, std::vector(kTaTypeNum, -1)) {} -AnalyzerOptions::~AnalyzerOptions() {} +AnalyzerOptions::~AnalyzerOptions() = default; void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) { std::string cur = in_str; @@ -214,14 +222,14 @@ void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) { exit(1); } std::string opt1, opt2; - std::size_t split = cur.find_first_of(","); + std::size_t split = cur.find_first_of(','); if (split != std::string::npos) { opt1 = cur.substr(1, split - 1); } else { fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str()); exit(1); } - std::size_t end = cur.find_first_of("]"); + std::size_t end = cur.find_first_of(']'); if (end != std::string::npos) { opt2 = cur.substr(split + 1, end - split - 1); } else { @@ -232,8 +240,7 @@ void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) { if (taOptToIndex.find(opt1) != taOptToIndex.end() && taOptToIndex.find(opt2) != taOptToIndex.end()) { - correlation_list.push_back( - std::make_pair(taOptToIndex[opt1], taOptToIndex[opt2])); + correlation_list.emplace_back(taOptToIndex[opt1], taOptToIndex[opt2]); } else { fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str()); exit(1); @@ -245,7 +252,6 @@ void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) { correlation_map[it.first][it.second] = sequence; sequence++; } - return; } // The trace statistic struct constructor @@ -264,7 +270,7 @@ TraceStats::TraceStats() { a_ave_qps = 0.0; } -TraceStats::~TraceStats() {} +TraceStats::~TraceStats() = default; // The trace analyzer constructor TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path, @@ -295,66 +301,66 @@ TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path, } ta_.resize(kTaTypeNum); - ta_[0].type_name = "get"; + ta_[kGet].type_name = "get"; if (FLAGS_analyze_get) { - ta_[0].enabled = true; + ta_[kGet].enabled = true; } else { - ta_[0].enabled = false; + ta_[kGet].enabled = false; } - ta_[1].type_name = "put"; + ta_[kPut].type_name = "put"; if (FLAGS_analyze_put) { - ta_[1].enabled = true; + ta_[kPut].enabled = true; } else { - ta_[1].enabled = false; + ta_[kPut].enabled = false; } - ta_[2].type_name = "delete"; + ta_[kDelete].type_name = "delete"; if (FLAGS_analyze_delete) { - ta_[2].enabled = true; + ta_[kDelete].enabled = true; } else { - ta_[2].enabled = false; + ta_[kDelete].enabled = false; } - ta_[3].type_name = "single_delete"; + ta_[kSingleDelete].type_name = "single_delete"; if (FLAGS_analyze_single_delete) { - ta_[3].enabled = true; + ta_[kSingleDelete].enabled = true; } else { - ta_[3].enabled = false; + ta_[kSingleDelete].enabled = false; } - ta_[4].type_name = "range_delete"; + ta_[kRangeDelete].type_name = "range_delete"; if (FLAGS_analyze_range_delete) { - ta_[4].enabled = true; + ta_[kRangeDelete].enabled = true; } else { - ta_[4].enabled = false; + ta_[kRangeDelete].enabled = false; } - ta_[5].type_name = "merge"; + ta_[kMerge].type_name = "merge"; if (FLAGS_analyze_merge) { - ta_[5].enabled = true; + ta_[kMerge].enabled = true; } else { - ta_[5].enabled = false; + ta_[kMerge].enabled = false; } - ta_[6].type_name = "iterator_Seek"; + ta_[kIteratorSeek].type_name = "iterator_Seek"; if (FLAGS_analyze_iterator) { - ta_[6].enabled = true; + ta_[kIteratorSeek].enabled = true; } else { - ta_[6].enabled = false; + ta_[kIteratorSeek].enabled = false; } - ta_[7].type_name = "iterator_SeekForPrev"; + ta_[kIteratorSeekForPrev].type_name = "iterator_SeekForPrev"; if (FLAGS_analyze_iterator) { - ta_[7].enabled = true; + ta_[kIteratorSeekForPrev].enabled = true; } else { - ta_[7].enabled = false; + ta_[kIteratorSeekForPrev].enabled = false; } - ta_[8].type_name = "multiget"; + ta_[kMultiGet].type_name = "multiget"; if (FLAGS_analyze_multiget) { - ta_[8].enabled = true; + ta_[kMultiGet].enabled = true; } else { - ta_[8].enabled = false; + ta_[kMultiGet].enabled = false; } for (int i = 0; i < kTaTypeNum; i++) { ta_[i].sample_count = 0; } } -TraceAnalyzer::~TraceAnalyzer() {} +TraceAnalyzer::~TraceAnalyzer() = default; // Prepare the processing // Initiate the global trace reader and writer here diff --git a/trace_replay/trace_record.cc b/trace_replay/trace_record.cc index 21df0275ddf..a4a4eb9f838 100644 --- a/trace_replay/trace_record.cc +++ b/trace_replay/trace_record.cc @@ -97,7 +97,7 @@ IteratorQueryTraceRecord::IteratorQueryTraceRecord( upper_.PinSelf(upper_bound); } -IteratorQueryTraceRecord::~IteratorQueryTraceRecord() {} +IteratorQueryTraceRecord::~IteratorQueryTraceRecord() = default; Slice IteratorQueryTraceRecord::GetLowerBound() const { return Slice(lower_); } diff --git a/trace_replay/trace_record_handler.h b/trace_replay/trace_record_handler.h index 88cf317ddad..d84b31567a9 100644 --- a/trace_replay/trace_record_handler.h +++ b/trace_replay/trace_record_handler.h @@ -22,16 +22,16 @@ class TraceExecutionHandler : public TraceRecord::Handler { public: TraceExecutionHandler(DB* db, const std::vector& handles); - virtual ~TraceExecutionHandler() override; - - virtual Status Handle(const WriteQueryTraceRecord& record, - std::unique_ptr* result) override; - virtual Status Handle(const GetQueryTraceRecord& record, - std::unique_ptr* result) override; - virtual Status Handle(const IteratorSeekQueryTraceRecord& record, - std::unique_ptr* result) override; - virtual Status Handle(const MultiGetQueryTraceRecord& record, - std::unique_ptr* result) override; + ~TraceExecutionHandler() override; + + Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* result) override; + Status Handle(const GetQueryTraceRecord& record, + std::unique_ptr* result) override; + Status Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr* result) override; + Status Handle(const MultiGetQueryTraceRecord& record, + std::unique_ptr* result) override; private: DB* db_; diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc index 126a8e248de..6ade8e316cf 100644 --- a/trace_replay/trace_replay.cc +++ b/trace_replay/trace_replay.cc @@ -58,8 +58,8 @@ Status TracerHelper::ParseTraceHeader(const Trace& header, int* trace_version, std::vector s_vec; int begin = 0, end; for (int i = 0; i < 3; i++) { - assert(header.payload.find("\t", begin) != std::string::npos); - end = static_cast(header.payload.find("\t", begin)); + assert(header.payload.find('\t', begin) != std::string::npos); + end = static_cast(header.payload.find('\t', begin)); s_vec.push_back(header.payload.substr(begin, end - begin)); begin = end + 1; } diff --git a/trace_replay/trace_replay.h b/trace_replay/trace_replay.h index 55908dcb7ed..1e4bc526d62 100644 --- a/trace_replay/trace_replay.h +++ b/trace_replay/trace_replay.h @@ -56,7 +56,7 @@ struct Trace { // example, if bit at position 0 is set in write payload, then the write batch // will be addedd. uint64_t payload_map = 0; - // Each trace type has its own payload_struct, which will be serilized in the + // Each trace type has its own payload_struct, which will be serialized in the // payload. std::string payload; diff --git a/unreleased_history/README.txt b/unreleased_history/README.txt index 1d641285d5e..ed77d815228 100644 --- a/unreleased_history/README.txt +++ b/unreleased_history/README.txt @@ -48,7 +48,7 @@ Updating HISTORY.md with release notes -------------------------------------- The script unreleased_history/release.sh does this. Run the script before -updating version.h to the next develpment release, so that the script will pick +updating version.h to the next development release, so that the script will pick up the version being released. You might want to start with $ DRY_RUN=1 unreleased_history/release.sh | less @@ -68,6 +68,6 @@ First, it was common to hit unnecessary merge conflicts when adding entries to HISTORY.md, which slowed development. Second, when a PR was opened before a release cut and landed after the release cut, it was easy to add the HISTORY entry to the wrong version's history. This new setup completely fixes both of -those issues, with perhaps slighly more initial work to create each entry. +those issues, with perhaps slightly more initial work to create each entry. There is also now an extra step in using `git blame` to map a release note to its source code implementation, but that is a relatively rare operation. diff --git a/util/autovector.h b/util/autovector.h index 79ee5de5725..39c7aabee57 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -61,7 +61,7 @@ class autovector { using iterator_category = std::random_access_iterator_tag; iterator_impl(TAutoVector* vect, size_t index) - : vect_(vect), index_(index){}; + : vect_(vect), index_(index){} iterator_impl(const iterator_impl&) = default; ~iterator_impl() {} iterator_impl& operator=(const iterator_impl&) = default; diff --git a/util/cast_util.h b/util/cast_util.h index e010274a75b..414feda9cbe 100644 --- a/util/cast_util.h +++ b/util/cast_util.h @@ -63,4 +63,26 @@ inline const std::initializer_list& List( return list; } +// UnownedPtr is useful as an efficient "optional reference" that can't +// be accidentally converted to std::shared_ptr nor std::unique_ptr. +template +class UnownedPtr { + public: + UnownedPtr() = default; + UnownedPtr(std::nullptr_t) {} + UnownedPtr(T* ptr) : ptr_(ptr) {} + UnownedPtr(const UnownedPtr&) = default; + UnownedPtr(UnownedPtr&&) = default; + UnownedPtr& operator=(const UnownedPtr&) = default; + UnownedPtr& operator=(UnownedPtr&&) = default; + + T* get() const { return ptr_; } + T* operator->() const { return ptr_; } + T& operator*() const { return *ptr_; } + operator bool() const { return ptr_ != nullptr; } + + private: + T* ptr_ = nullptr; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/util/coding.h b/util/coding.h index 3168fd2fd1f..929c8e42c46 100644 --- a/util/coding.h +++ b/util/coding.h @@ -34,40 +34,38 @@ namespace ROCKSDB_NAMESPACE { const uint32_t kMaxVarint64Length = 10; // Standard Put... routines append to a string -extern void PutFixed16(std::string* dst, uint16_t value); -extern void PutFixed32(std::string* dst, uint32_t value); -extern void PutFixed64(std::string* dst, uint64_t value); -extern void PutVarint32(std::string* dst, uint32_t value); -extern void PutVarint32Varint32(std::string* dst, uint32_t value1, - uint32_t value2); -extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1, - uint32_t value2, uint32_t value3); -extern void PutVarint64(std::string* dst, uint64_t value); -extern void PutVarint64Varint64(std::string* dst, uint64_t value1, - uint64_t value2); -extern void PutVarint32Varint64(std::string* dst, uint32_t value1, - uint64_t value2); -extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1, - uint32_t value2, uint64_t value3); -extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); -extern void PutLengthPrefixedSliceParts(std::string* dst, - const SliceParts& slice_parts); -extern void PutLengthPrefixedSlicePartsWithPadding( - std::string* dst, const SliceParts& slice_parts, size_t pad_sz); +void PutFixed16(std::string* dst, uint16_t value); +void PutFixed32(std::string* dst, uint32_t value); +void PutFixed64(std::string* dst, uint64_t value); +void PutVarint32(std::string* dst, uint32_t value); +void PutVarint32Varint32(std::string* dst, uint32_t value1, uint32_t value2); +void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2, uint32_t value3); +void PutVarint64(std::string* dst, uint64_t value); +void PutVarint64Varint64(std::string* dst, uint64_t value1, uint64_t value2); +void PutVarint32Varint64(std::string* dst, uint32_t value1, uint64_t value2); +void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1, + uint32_t value2, uint64_t value3); +void PutLengthPrefixedSlice(std::string* dst, const Slice& value); +void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); +void PutLengthPrefixedSlicePartsWithPadding(std::string* dst, + const SliceParts& slice_parts, + size_t pad_sz); // Standard Get... routines parse a value from the beginning of a Slice // and advance the slice past the parsed value. -extern bool GetFixed64(Slice* input, uint64_t* value); -extern bool GetFixed32(Slice* input, uint32_t* value); -extern bool GetFixed16(Slice* input, uint16_t* value); -extern bool GetVarint32(Slice* input, uint32_t* value); -extern bool GetVarint64(Slice* input, uint64_t* value); -extern bool GetVarsignedint64(Slice* input, int64_t* value); -extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +bool GetFixed64(Slice* input, uint64_t* value); +bool GetFixed32(Slice* input, uint32_t* value); +bool GetFixed16(Slice* input, uint16_t* value); +bool GetVarint32(Slice* input, uint32_t* value); +bool GetVarint64(Slice* input, uint64_t* value); +bool GetVarsignedint64(Slice* input, int64_t* value); +bool GetLengthPrefixedSlice(Slice* input, Slice* result); // This function assumes data is well-formed. -extern Slice GetLengthPrefixedSlice(const char* data); +Slice GetLengthPrefixedSlice(const char* data); -extern Slice GetSliceUntil(Slice* slice, char delimiter); +Slice GetSliceUntil(Slice* slice, char delimiter); // Borrowed from // https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208 @@ -82,10 +80,8 @@ inline int64_t zigzagToI64(uint64_t n) { // in *v and return a pointer just past the parsed value, or return // nullptr on error. These routines only look at bytes in the range // [p..limit-1] -extern const char* GetVarint32Ptr(const char* p, const char* limit, - uint32_t* v); -extern const char* GetVarint64Ptr(const char* p, const char* limit, - uint64_t* v); +const char* GetVarint32Ptr(const char* p, const char* limit, uint32_t* v); +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* v); inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, int64_t* value) { uint64_t u = 0; @@ -95,17 +91,17 @@ inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, } // Returns the length of the varint32 or varint64 encoding of "v" -extern int VarintLength(uint64_t v); +int VarintLength(uint64_t v); // Lower-level versions of Put... that write directly into a character buffer // and return a pointer just past the last byte written. // REQUIRES: dst has enough space for the value being written -extern char* EncodeVarint32(char* dst, uint32_t value); -extern char* EncodeVarint64(char* dst, uint64_t value); +char* EncodeVarint32(char* dst, uint32_t value); +char* EncodeVarint64(char* dst, uint64_t value); // Internal routine for use by fallback path of GetVarint32Ptr -extern const char* GetVarint32PtrFallback(const char* p, const char* limit, - uint32_t* value); +const char* GetVarint32PtrFallback(const char* p, const char* limit, + uint32_t* value); inline const char* GetVarint32Ptr(const char* p, const char* limit, uint32_t* value) { if (p < limit) { diff --git a/util/comparator.cc b/util/comparator.cc index 98ecef9d26b..a5d7a7ca0bd 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -9,9 +9,8 @@ #include "rocksdb/comparator.h" -#include - #include +#include #include #include #include @@ -30,7 +29,7 @@ namespace ROCKSDB_NAMESPACE { namespace { class BytewiseComparatorImpl : public Comparator { public: - BytewiseComparatorImpl() {} + BytewiseComparatorImpl() = default; static const char* kClassName() { return "leveldb.BytewiseComparator"; } const char* Name() const override { return kClassName(); } @@ -112,7 +111,9 @@ class BytewiseComparatorImpl : public Comparator { } size_t diff_ind = s.difference_offset(t); // same slice - if (diff_ind >= s.size()) return false; + if (diff_ind >= s.size()) { + return false; + } uint8_t byte_s = static_cast(s[diff_ind]); uint8_t byte_t = static_cast(t[diff_ind]); // first different byte must be consecutive, and remaining bytes must be @@ -148,7 +149,7 @@ class BytewiseComparatorImpl : public Comparator { class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { public: - ReverseBytewiseComparatorImpl() {} + ReverseBytewiseComparatorImpl() = default; static const char* kClassName() { return "rocksdb.ReverseBytewiseComparator"; diff --git a/util/compression_context_cache.cc b/util/compression_context_cache.cc index 52c3fac72ac..789cc7b624e 100644 --- a/util/compression_context_cache.cc +++ b/util/compression_context_cache.cc @@ -67,7 +67,7 @@ static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0, class CompressionContextCache::Rep { public: - Rep() {} + Rep() = default; ZSTDUncompressCachedData GetZSTDUncompressData() { auto p = per_core_uncompr_.AccessElementAndIndex(); int64_t idx = static_cast(p.second); diff --git a/util/concurrent_task_limiter_impl.h b/util/concurrent_task_limiter_impl.h index 4952ae23aa9..71e15ead342 100644 --- a/util/concurrent_task_limiter_impl.h +++ b/util/concurrent_task_limiter_impl.h @@ -29,13 +29,13 @@ class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter { virtual ~ConcurrentTaskLimiterImpl(); - virtual const std::string& GetName() const override; + const std::string& GetName() const override; - virtual void SetMaxOutstandingTask(int32_t limit) override; + void SetMaxOutstandingTask(int32_t limit) override; - virtual void ResetMaxOutstandingTask() override; + void ResetMaxOutstandingTask() override; - virtual int32_t GetOutstandingTask() const override; + int32_t GetOutstandingTask() const override; // Request token for adding a new task. // If force == true, it requests a token bypassing throttle. diff --git a/util/crc32c.cc b/util/crc32c.cc index 9e97045f447..c00276d765c 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -11,9 +11,8 @@ // four bytes at a time. #include "util/crc32c.h" -#include - #include +#include #include #include "port/lang.h" @@ -57,8 +56,7 @@ ASSERT_FEATURE_COMPAT_HEADER(); bool pmull_runtime_flag = false; #endif -namespace ROCKSDB_NAMESPACE { -namespace crc32c { +namespace ROCKSDB_NAMESPACE::crc32c { #if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) #ifdef __powerpc64__ @@ -1294,5 +1292,4 @@ uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len) { pure_crc2_with_init); } -} // namespace crc32c -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::crc32c diff --git a/util/crc32c.h b/util/crc32c.h index a08ad60af3f..81c27142fcf 100644 --- a/util/crc32c.h +++ b/util/crc32c.h @@ -18,18 +18,18 @@ namespace ROCKSDB_NAMESPACE { namespace crc32c { -extern std::string IsFastCrc32Supported(); +std::string IsFastCrc32Supported(); // Return the crc32c of concat(A, data[0,n-1]) where init_crc is the // crc32c of some string A. Extend() is often used to maintain the // crc32c of a stream of data. -extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); +uint32_t Extend(uint32_t init_crc, const char* data, size_t n); // Takes two unmasked crc32c values, and the length of the string from // which `crc2` was computed, and computes a crc32c value for the // concatenation of the original two input strings. Running time is // ~ log(crc2len). -extern uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len); +uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len); // Return the crc32c of data[0,n-1] inline uint32_t Value(const char* data, size_t n) { return Extend(0, data, n); } diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h index 4b27fe87108..5df3fa8d9de 100644 --- a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -36,10 +36,9 @@ PREF4X64L1(buffer, (PREF_OFFSET), 8) \ PREF4X64L1(buffer, (PREF_OFFSET), 12) -extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, - size_t len); -extern uint32_t crc32c_runtime_check(void); -extern bool crc32c_pmull_runtime_check(void); +uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len); +uint32_t crc32c_runtime_check(void); +bool crc32c_pmull_runtime_check(void); #ifdef __ARM_FEATURE_CRYPTO #define HAVE_ARM64_CRYPTO diff --git a/util/crc32c_ppc.h b/util/crc32c_ppc.h index f0b0b66d5db..365ba2c427a 100644 --- a/util/crc32c_ppc.h +++ b/util/crc32c_ppc.h @@ -14,8 +14,7 @@ extern "C" { #endif -extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, - size_t len); +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, size_t len); #ifdef __cplusplus } diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc index 715d63e2de0..ecb43d7fb87 100644 --- a/util/crc32c_test.cc +++ b/util/crc32c_test.cc @@ -12,8 +12,7 @@ #include "util/coding.h" #include "util/random.h" -namespace ROCKSDB_NAMESPACE { -namespace crc32c { +namespace ROCKSDB_NAMESPACE::crc32c { class CRC {}; @@ -170,8 +169,7 @@ TEST(CRC, Crc32cCombineBigSizeTest) { ASSERT_EQ(crc1_2, crc1_2_combine); } -} // namespace crc32c -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::crc32c // copied from folly const uint64_t FNV_64_HASH_START = 14695981039346656037ULL; diff --git a/util/data_structure.cc b/util/data_structure.cc index d647df5d5b2..04d0442a5fa 100644 --- a/util/data_structure.cc +++ b/util/data_structure.cc @@ -7,12 +7,10 @@ #include "util/math.h" -namespace ROCKSDB_NAMESPACE { -namespace detail { +namespace ROCKSDB_NAMESPACE::detail { int CountTrailingZeroBitsForSmallEnumSet(uint64_t v) { return CountTrailingZeroBits(v); } -} // namespace detail -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::detail diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 925c5479ab0..949ab8f76bb 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -164,10 +164,11 @@ TEST_F(DynamicBloomTest, VaryingLengths) { "%5.2f%% @ num = %6u, bloom_bits = %6u\n", nonseq ? "nonseq" : "seq", rate * 100.0, num, bloom_bits); - if (rate > 0.0125) + if (rate > 0.0125) { mediocre_filters++; // Allowed, but not too often - else + } else { good_filters++; + } } } diff --git a/util/file_checksum_helper.cc b/util/file_checksum_helper.cc index b8c4099b805..d6b29431fb0 100644 --- a/util/file_checksum_helper.cc +++ b/util/file_checksum_helper.cc @@ -31,7 +31,7 @@ Status FileChecksumListImpl::GetAllFileChecksums( return Status::InvalidArgument("Pointer has not been initiated"); } - for (auto i : checksum_map_) { + for (const auto& i : checksum_map_) { file_numbers->push_back(i.first); checksums->push_back(i.second.first); checksum_func_names->push_back(i.second.second); @@ -98,7 +98,7 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, return Status::InvalidArgument("checksum_list is nullptr"); } assert(checksum_list); - // TODO: plumb Env::IOActivity + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; checksum_list->reset(); Status s; @@ -118,7 +118,7 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, struct LogReporter : public log::Reader::Reporter { Status* status_ptr; - virtual void Corruption(size_t /*bytes*/, const Status& st) override { + void Corruption(size_t /*bytes*/, const Status& st) override { if (status_ptr->ok()) { *status_ptr = st; } diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 68776612b90..f317119e757 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -31,7 +31,7 @@ TEST_F(WritableFileWriterTest, RangeSync) { class FakeWF : public FSWritableFile { public: explicit FakeWF() : size_(0), last_synced_(0) {} - ~FakeWF() override {} + ~FakeWF() override = default; using FSWritableFile::Append; IOStatus Append(const Slice& data, const IOOptions& /*options*/, @@ -113,16 +113,16 @@ TEST_F(WritableFileWriterTest, RangeSync) { for (int i = 0; i < 1000; i++) { int skew_limit = (i < 700) ? 10 : 15; uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); - s = writer->Append(Slice(large_buf.get(), num)); + s = writer->Append(IOOptions(), Slice(large_buf.get(), num)); ASSERT_OK(s); // Flush in a chance of 1/10. if (r.Uniform(10) == 0) { - s = writer->Flush(); + s = writer->Flush(IOOptions()); ASSERT_OK(s); } } - s = writer->Close(); + s = writer->Close(IOOptions()); ASSERT_OK(s); } @@ -134,7 +134,7 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { : file_data_(_file_data), use_direct_io_(_use_direct_io), no_flush_(_no_flush) {} - ~FakeWF() override {} + ~FakeWF() override = default; using FSWritableFile::Append; IOStatus Append(const Slice& data, const IOOptions& /*options*/, @@ -215,16 +215,16 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { for (int i = 0; i < 20; i++) { uint32_t num = r.Skewed(16) * 100 + r.Uniform(100); std::string random_string = r.RandomString(num); - ASSERT_OK(writer->Append(Slice(random_string.c_str(), num))); + ASSERT_OK(writer->Append(IOOptions(), Slice(random_string.c_str(), num))); target.append(random_string.c_str(), num); // In some attempts, flush in a chance of 1/10. if (!no_flush && r.Uniform(10) == 0) { - ASSERT_OK(writer->Flush()); + ASSERT_OK(writer->Flush(IOOptions())); } } - ASSERT_OK(writer->Flush()); - ASSERT_OK(writer->Close()); + ASSERT_OK(writer->Flush(IOOptions())); + ASSERT_OK(writer->Close(IOOptions())); ASSERT_EQ(target.size(), actual.size()); ASSERT_EQ(target, actual); } @@ -272,27 +272,28 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksum) { ImmutableOptions ioptions(options); file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, true)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + true)); Random rnd(301); std::string data = rnd.RandomString(1000); uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size()); fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); - - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); + ASSERT_OK(file_writer->Flush(IOOptions())); Random size_r(47); for (int i = 0; i < 2000; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); } - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); Destroy(options); } @@ -314,27 +315,29 @@ TEST_F(DBWritableFileWriterTest, AppendVerifyNoChecksum) { // So Append with checksum logic will not be triggered file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, false)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + false)); Random rnd(301); std::string data = rnd.RandomString(1000); uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size()); fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); + ASSERT_OK(file_writer->Flush(IOOptions())); Random size_r(47); for (int i = 0; i < 1000; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); } - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); Destroy(options); } @@ -357,8 +360,9 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { // So Append with checksum logic will not be triggered file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, true)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + true)); fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); Random rnd(301); @@ -370,17 +374,18 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { for (int i = 0; i < 100; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); bytes_written += static_cast(data.size()); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); bytes_written += static_cast(data.size()); } uint64_t elapsed = fault_env_->NowMicros() - start; double raw_rate = bytes_written * 1000000.0 / elapsed; - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); // Set the rate-limiter FileOptions file_options1 = FileOptions(); @@ -397,19 +402,21 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { // So Append with checksum logic will not be triggered file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options1, SystemClock::Default().get(), - nullptr, ioptions.stats, ioptions.listeners, - ioptions.file_checksum_gen_factory.get(), true, true)); + nullptr, ioptions.stats, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, + ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, + true)); for (int i = 0; i < 1000; i++) { data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); data_crc32c = crc32c::Value(data.c_str(), data.size()); - ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK( + file_writer->Append(IOOptions(), Slice(data.c_str()), data_crc32c)); data = rnd.RandomString((static_cast(size_r.Next()) % 97)); - ASSERT_OK(file_writer->Append(Slice(data.c_str()))); - ASSERT_OK(file_writer->Flush()); + ASSERT_OK(file_writer->Append(IOOptions(), Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush(IOOptions())); } - ASSERT_OK(file_writer->Close()); + ASSERT_OK(file_writer->Close(IOOptions())); if (file_options1.rate_limiter != nullptr) { delete file_options1.rate_limiter; } @@ -456,6 +463,11 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) { void Setuse_direct_io(bool val) { use_direct_io_ = val; } void SetIOError(bool val) { io_error_ = val; } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return 0; + } + protected: bool use_direct_io_; bool io_error_; @@ -465,12 +477,12 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) { std::unique_ptr writer( new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); - ASSERT_OK(writer->Append(std::string(2 * kMb, 'a'))); + ASSERT_OK(writer->Append(IOOptions(), std::string(2 * kMb, 'a'))); // Next call to WritableFile::Append() should fail FakeWF* fwf = static_cast(writer->writable_file()); fwf->SetIOError(true); - ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b'))); + ASSERT_NOK(writer->Append(IOOptions(), std::string(2 * kMb, 'b'))); } class ReadaheadRandomAccessFileTest @@ -498,9 +510,9 @@ class ReadaheadRandomAccessFileTest new test::StringSink(&control_contents_)); std::unique_ptr write_holder(new WritableFileWriter( std::move(sink), "" /* don't care */, FileOptions())); - Status s = write_holder->Append(Slice(str)); + Status s = write_holder->Append(IOOptions(), Slice(str)); EXPECT_OK(s); - s = write_holder->Flush(); + s = write_holder->Flush(IOOptions()); EXPECT_OK(s); std::unique_ptr read_holder( new test::StringSource(control_contents_)); @@ -588,7 +600,7 @@ class ReadaheadSequentialFileTest : public testing::Test, scratch_.reset(new char[2 * readahead_size_]); ResetSourceStr(); } - ReadaheadSequentialFileTest() {} + ReadaheadSequentialFileTest() = default; std::string Read(size_t n) { Slice result; Status s = test_read_holder_->Read( @@ -855,6 +867,11 @@ TEST_F(DBWritableFileWriterTest, IOErrorNotification) { ASSERT_EQ(file_flush_errors_, file_flush_errors); } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return 0; + } + protected: bool io_error_; std::atomic file_append_errors_; @@ -878,26 +895,27 @@ TEST_F(DBWritableFileWriterTest, IOErrorNotification) { file_writer.reset(new WritableFileWriter( std::move(writable_file_ptr), fname, file_options, - SystemClock::Default().get(), nullptr, ioptions.stats, ioptions.listeners, + SystemClock::Default().get(), nullptr, ioptions.stats, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, ioptions.listeners, ioptions.file_checksum_gen_factory.get(), true, true)); FakeWF* fwf = static_cast(file_writer->writable_file()); fwf->SetIOError(true); - ASSERT_NOK(file_writer->Append(std::string(2 * kMb, 'a'))); + ASSERT_NOK(file_writer->Append(IOOptions(), std::string(2 * kMb, 'a'))); fwf->CheckCounters(1, 0); ASSERT_EQ(listener->NotifyErrorCount(), 1); file_writer->reset_seen_error(); fwf->SetIOError(true); - ASSERT_NOK(file_writer->Flush()); + ASSERT_NOK(file_writer->Flush(IOOptions())); fwf->CheckCounters(1, 1); ASSERT_EQ(listener->NotifyErrorCount(), 2); /* No error generation */ file_writer->reset_seen_error(); fwf->SetIOError(false); - ASSERT_OK(file_writer->Append(std::string(2 * kMb, 'b'))); + ASSERT_OK(file_writer->Append(IOOptions(), std::string(2 * kMb, 'b'))); ASSERT_EQ(listener->NotifyErrorCount(), 2); fwf->CheckCounters(1, 1); } @@ -919,7 +937,7 @@ class WritableFileWriterIOPriorityTest : public testing::Test { class FakeWF : public FSWritableFile { public: explicit FakeWF(Env::IOPriority io_priority) { SetIOPriority(io_priority); } - ~FakeWF() override {} + ~FakeWF() override = default; IOStatus Append(const Slice& /*data*/, const IOOptions& options, IODebugContext* /*dbg*/) override { @@ -1006,23 +1024,29 @@ class WritableFileWriterIOPriorityTest : public testing::Test { }; TEST_F(WritableFileWriterIOPriorityTest, Append) { - ASSERT_OK(writer_->Append(Slice("abc"))); + ASSERT_OK(writer_->Append(IOOptions(), Slice("abc"))); } -TEST_F(WritableFileWriterIOPriorityTest, Pad) { ASSERT_OK(writer_->Pad(500)); } +TEST_F(WritableFileWriterIOPriorityTest, Pad) { + ASSERT_OK(writer_->Pad(IOOptions(), 500)); +} -TEST_F(WritableFileWriterIOPriorityTest, Flush) { ASSERT_OK(writer_->Flush()); } +TEST_F(WritableFileWriterIOPriorityTest, Flush) { + ASSERT_OK(writer_->Flush(IOOptions())); +} -TEST_F(WritableFileWriterIOPriorityTest, Close) { ASSERT_OK(writer_->Close()); } +TEST_F(WritableFileWriterIOPriorityTest, Close) { + ASSERT_OK(writer_->Close(IOOptions())); +} TEST_F(WritableFileWriterIOPriorityTest, Sync) { - ASSERT_OK(writer_->Sync(false)); - ASSERT_OK(writer_->Sync(true)); + ASSERT_OK(writer_->Sync(IOOptions(), false)); + ASSERT_OK(writer_->Sync(IOOptions(), true)); } TEST_F(WritableFileWriterIOPriorityTest, SyncWithoutFlush) { - ASSERT_OK(writer_->SyncWithoutFlush(false)); - ASSERT_OK(writer_->SyncWithoutFlush(true)); + ASSERT_OK(writer_->SyncWithoutFlush(IOOptions(), false)); + ASSERT_OK(writer_->SyncWithoutFlush(IOOptions(), true)); } TEST_F(WritableFileWriterIOPriorityTest, BasicOp) { @@ -1037,16 +1061,16 @@ TEST_F(WritableFileWriterIOPriorityTest, BasicOp) { for (int i = 0; i < 1000; i++) { int skew_limit = (i < 700) ? 10 : 15; uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); - s = writer->Append(Slice(large_buf.get(), num)); + s = writer->Append(IOOptions(), Slice(large_buf.get(), num)); ASSERT_OK(s); // Flush in a chance of 1/10. if (r.Uniform(10) == 0) { - s = writer->Flush(); + s = writer->Flush(IOOptions()); ASSERT_OK(s); } } - s = writer->Close(); + s = writer->Close(IOOptions()); ASSERT_OK(s); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/filelock_test.cc b/util/filelock_test.cc index 82021aec99a..f779a8b2bad 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -34,7 +34,7 @@ class LockTest : public testing::Test { current_ = this; } - ~LockTest() override {} + ~LockTest() override = default; Status LockFile(FileLock** db_lock) { return env_->LockFile(file_, db_lock); } @@ -94,8 +94,8 @@ class LockTest : public testing::Test { } else if (pid > 0) { // parent process int status; - while (-1 == waitpid(pid, &status, 0)) - ; + while (-1 == waitpid(pid, &status, 0)) { + } if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { // child process exited with non success status return false; diff --git a/util/hash.h b/util/hash.h index 7a24659ad1c..63bc102de42 100644 --- a/util/hash.h +++ b/util/hash.h @@ -31,10 +31,10 @@ namespace ROCKSDB_NAMESPACE { // Hash(), especially for inputs > 24 bytes. // KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent // results from previous seed. Recommend incrementing by a large odd number. -extern uint64_t Hash64(const char* data, size_t n, uint64_t seed); +uint64_t Hash64(const char* data, size_t n, uint64_t seed); // Specific optimization without seed (same as seed = 0) -extern uint64_t Hash64(const char* data, size_t n); +uint64_t Hash64(const char* data, size_t n); // Non-persistent hash. Must only used for in-memory data structures. // The hash results are thus subject to change between releases, @@ -87,7 +87,7 @@ void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, // TODO: consider rename to Hash32 // KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent // results from previous seed. Recommend pseudorandom or hashed seeds. -extern uint32_t Hash(const char* data, size_t n, uint32_t seed); +uint32_t Hash(const char* data, size_t n, uint32_t seed); // TODO: consider rename to LegacyBloomHash32 inline uint32_t BloomHash(const Slice& key) { @@ -111,7 +111,7 @@ inline uint64_t GetSliceNPHash64(const Slice& s, uint64_t seed) { // Similar to `GetSliceNPHash64()` with `seed`, but input comes from // concatenation of `Slice`s in `data`. -extern uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed); +uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed); inline size_t GetSliceRangedNPHash(const Slice& s, size_t range) { return FastRange64(NPHash64(s.data(), s.size()), range); diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index c1637db15d5..25602791ecf 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -41,9 +41,9 @@ void RunBenchmark() { std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); std::unique_ptr writer; - writer.reset(new WritableFileWriter(std::move(file), file_name, env_options, - clock, nullptr /* stats */, - options.listeners)); + writer.reset(new WritableFileWriter( + std::move(file), file_name, env_options, clock, nullptr /* stats */, + Histograms::HISTOGRAM_ENUM_MAX /* hist_type */, options.listeners)); std::string record; record.assign(FLAGS_record_size, 'X'); diff --git a/util/murmurhash.cc b/util/murmurhash.cc index a69f3918abe..41da76c5988 100644 --- a/util/murmurhash.cc +++ b/util/murmurhash.cc @@ -64,7 +64,7 @@ uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) case 2: h ^= ((uint64_t)data2[1]) << 8; FALLTHROUGH_INTENDED; case 1: h ^= ((uint64_t)data2[0]); h *= m; - }; + } h ^= h >> r; h *= m; diff --git a/util/random.cc b/util/random.cc index 7ac6ee19a1b..17396d32ff3 100644 --- a/util/random.cc +++ b/util/random.cc @@ -6,10 +6,9 @@ #include "util/random.h" -#include -#include -#include - +#include +#include +#include #include #include diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index e92b3bf7634..c2675a22225 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -46,13 +46,14 @@ struct GenericRateLimiter::Req { GenericRateLimiter::GenericRateLimiter( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness, RateLimiter::Mode mode, const std::shared_ptr& clock, - bool auto_tuned) + bool auto_tuned, int64_t single_burst_bytes) : RateLimiter(mode), refill_period_us_(refill_period_us), rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2 : rate_bytes_per_sec), refill_bytes_per_period_( CalculateRefillBytesPerPeriodLocked(rate_bytes_per_sec_)), + raw_single_burst_bytes_(single_burst_bytes), clock_(clock), stop_(false), exit_cv_(&request_mutex_), @@ -108,25 +109,19 @@ void GenericRateLimiter::SetBytesPerSecondLocked(int64_t bytes_per_second) { } Status GenericRateLimiter::SetSingleBurstBytes(int64_t single_burst_bytes) { - if (single_burst_bytes <= 0) { + if (single_burst_bytes < 0) { return Status::InvalidArgument( - "`single_burst_bytes` must be greater than 0"); + "`single_burst_bytes` must be greater than or equal to 0"); } MutexLock g(&request_mutex_); - SetSingleBurstBytesLocked(single_burst_bytes); + raw_single_burst_bytes_.store(single_burst_bytes, std::memory_order_relaxed); return Status::OK(); } -void GenericRateLimiter::SetSingleBurstBytesLocked(int64_t single_burst_bytes) { - refill_bytes_per_period_.store(single_burst_bytes, std::memory_order_relaxed); - refill_period_us_.store(CalculateRefillPeriodUsLocked(single_burst_bytes), - std::memory_order_relaxed); -} - void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, Statistics* stats) { - assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed)); + assert(bytes <= GetSingleBurstBytes()); bytes = std::max(static_cast(0), bytes); TEST_SYNC_POINT("GenericRateLimiter::Request"); TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1", @@ -137,8 +132,7 @@ void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, static const int kRefillsPerTune = 100; std::chrono::microseconds now(NowMicrosMonotonicLocked()); if (now - tuned_time_ >= - kRefillsPerTune * std::chrono::microseconds(refill_period_us_.load( - std::memory_order_relaxed))) { + kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) { Status s = TuneLocked(); s.PermitUncheckedError(); //**TODO: What to do on error? } @@ -279,8 +273,7 @@ GenericRateLimiter::GeneratePriorityIterationOrderLocked() { void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { TEST_SYNC_POINT_CALLBACK( "GenericRateLimiter::RefillBytesAndGrantRequestsLocked", &request_mutex_); - next_refill_us_ = NowMicrosMonotonicLocked() + - refill_period_us_.load(std::memory_order_relaxed); + next_refill_us_ = NowMicrosMonotonicLocked() + refill_period_us_; // Carry over the left over quota from the last period auto refill_bytes_per_period = refill_bytes_per_period_.load(std::memory_order_relaxed); @@ -297,10 +290,13 @@ void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { while (!queue->empty()) { auto* next_req = queue->front(); if (available_bytes_ < next_req->request_bytes) { - // Grant partial request_bytes to avoid starvation of requests - // that become asking for more bytes than available_bytes_ - // due to dynamically reduced rate limiter's bytes_per_second that - // leads to reduced refill_bytes_per_period hence available_bytes_ + // Grant partial request_bytes even if request is for more than + // `available_bytes_`, which can happen in a few situations: + // + // - The available bytes were partially consumed by other request(s) + // - The rate was dynamically reduced while requests were already + // enqueued + // - The burst size was explicitly set to be larger than the refill size next_req->request_bytes -= available_bytes_; available_bytes_ = 0; break; @@ -318,28 +314,13 @@ void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { int64_t GenericRateLimiter::CalculateRefillBytesPerPeriodLocked( int64_t rate_bytes_per_sec) { - int64_t refill_period_us = refill_period_us_.load(std::memory_order_relaxed); if (std::numeric_limits::max() / rate_bytes_per_sec < - refill_period_us) { + refill_period_us_) { // Avoid unexpected result in the overflow case. The result now is still // inaccurate but is a number that is large enough. return std::numeric_limits::max() / kMicrosecondsPerSecond; } else { - return rate_bytes_per_sec * refill_period_us / kMicrosecondsPerSecond; - } -} - -int64_t GenericRateLimiter::CalculateRefillPeriodUsLocked( - int64_t single_burst_bytes) { - int64_t rate_bytes_per_sec = - rate_bytes_per_sec_.load(std::memory_order_relaxed); - if (std::numeric_limits::max() / single_burst_bytes < - kMicrosecondsPerSecond) { - // Avoid unexpected result in the overflow case. The result now is still - // inaccurate but is a number that is large enough. - return std::numeric_limits::max() / rate_bytes_per_sec; - } else { - return single_burst_bytes * kMicrosecondsPerSecond / rate_bytes_per_sec; + return rate_bytes_per_sec * refill_period_us_ / kMicrosecondsPerSecond; } } @@ -354,11 +335,10 @@ Status GenericRateLimiter::TuneLocked() { std::chrono::microseconds prev_tuned_time = tuned_time_; tuned_time_ = std::chrono::microseconds(NowMicrosMonotonicLocked()); - int64_t refill_period_us = refill_period_us_.load(std::memory_order_relaxed); int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time + - std::chrono::microseconds(refill_period_us) - + std::chrono::microseconds(refill_period_us_) - std::chrono::microseconds(1)) / - std::chrono::microseconds(refill_period_us); + std::chrono::microseconds(refill_period_us_); // We tune every kRefillsPerTune intervals, so the overflow and division-by- // zero conditions should never happen. assert(num_drains_ <= std::numeric_limits::max() / 100); @@ -398,13 +378,13 @@ RateLimiter* NewGenericRateLimiter( int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */, int32_t fairness /* = 10 */, RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */, - bool auto_tuned /* = false */) { + bool auto_tuned /* = false */, int64_t single_burst_bytes /* = 0 */) { assert(rate_bytes_per_sec > 0); assert(refill_period_us > 0); assert(fairness > 0); - std::unique_ptr limiter( - new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness, - mode, SystemClock::Default(), auto_tuned)); + std::unique_ptr limiter(new GenericRateLimiter( + rate_bytes_per_sec, refill_period_us, fairness, mode, + SystemClock::Default(), auto_tuned, single_burst_bytes)); return limiter.release(); } diff --git a/util/rate_limiter_impl.h b/util/rate_limiter_impl.h index c6786b04857..23bbd56e081 100644 --- a/util/rate_limiter_impl.h +++ b/util/rate_limiter_impl.h @@ -28,29 +28,34 @@ class GenericRateLimiter : public RateLimiter { public: GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us, int32_t fairness, RateLimiter::Mode mode, - const std::shared_ptr& clock, - bool auto_tuned); + const std::shared_ptr& clock, bool auto_tuned, + int64_t single_burst_bytes); virtual ~GenericRateLimiter(); // This API allows user to dynamically change rate limiter's bytes per second. - virtual void SetBytesPerSecond(int64_t bytes_per_second) override; + void SetBytesPerSecond(int64_t bytes_per_second) override; - virtual Status SetSingleBurstBytes(int64_t single_burst_bytes) override; + Status SetSingleBurstBytes(int64_t single_burst_bytes) override; // Request for token to write bytes. If this request can not be satisfied, // the call is blocked. Caller is responsible to make sure // bytes <= GetSingleBurstBytes() and bytes >= 0. Negative bytes // passed in will be rounded up to 0. using RateLimiter::Request; - virtual void Request(const int64_t bytes, const Env::IOPriority pri, - Statistics* stats) override; - - virtual int64_t GetSingleBurstBytes() const override { - return refill_bytes_per_period_.load(std::memory_order_relaxed); + void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats) override; + + int64_t GetSingleBurstBytes() const override { + int64_t raw_single_burst_bytes = + raw_single_burst_bytes_.load(std::memory_order_relaxed); + if (raw_single_burst_bytes == 0) { + return refill_bytes_per_period_.load(std::memory_order_relaxed); + } + return raw_single_burst_bytes; } - virtual int64_t GetTotalBytesThrough( + int64_t GetTotalBytesThrough( const Env::IOPriority pri = Env::IO_TOTAL) const override { MutexLock g(&request_mutex_); if (pri == Env::IO_TOTAL) { @@ -63,7 +68,7 @@ class GenericRateLimiter : public RateLimiter { return total_bytes_through_[pri]; } - virtual int64_t GetTotalRequests( + int64_t GetTotalRequests( const Env::IOPriority pri = Env::IO_TOTAL) const override { MutexLock g(&request_mutex_); if (pri == Env::IO_TOTAL) { @@ -76,7 +81,7 @@ class GenericRateLimiter : public RateLimiter { return total_requests_[pri]; } - virtual Status GetTotalPendingRequests( + Status GetTotalPendingRequests( int64_t* total_pending_requests, const Env::IOPriority pri = Env::IO_TOTAL) const override { assert(total_pending_requests != nullptr); @@ -93,7 +98,7 @@ class GenericRateLimiter : public RateLimiter { return Status::OK(); } - virtual int64_t GetBytesPerSecond() const override { + int64_t GetBytesPerSecond() const override { return rate_bytes_per_sec_.load(std::memory_order_relaxed); } @@ -108,10 +113,8 @@ class GenericRateLimiter : public RateLimiter { void RefillBytesAndGrantRequestsLocked(); std::vector GeneratePriorityIterationOrderLocked(); int64_t CalculateRefillBytesPerPeriodLocked(int64_t rate_bytes_per_sec); - int64_t CalculateRefillPeriodUsLocked(int64_t single_burst_bytes); Status TuneLocked(); void SetBytesPerSecondLocked(int64_t bytes_per_second); - void SetSingleBurstBytesLocked(int64_t single_burst_bytes); uint64_t NowMicrosMonotonicLocked() { return clock_->NowNanos() / std::milli::den; @@ -120,10 +123,12 @@ class GenericRateLimiter : public RateLimiter { // This mutex guard all internal states mutable port::Mutex request_mutex_; - std::atomic refill_period_us_; + const int64_t refill_period_us_; std::atomic rate_bytes_per_sec_; std::atomic refill_bytes_per_period_; + // This value is validated but unsanitized (may be zero). + std::atomic raw_single_burst_bytes_; std::shared_ptr clock_; bool stop_; diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 16e7623ac8d..bae1e846276 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -35,7 +35,8 @@ class RateLimiterTest : public testing::Test { TEST_F(RateLimiterTest, OverflowRate) { GenericRateLimiter limiter(std::numeric_limits::max(), 1000, 10, RateLimiter::Mode::kWritesOnly, - SystemClock::Default(), false /* auto_tuned */); + SystemClock::Default(), false /* auto_tuned */, + 0 /* single_burst_bytes */); ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll); } @@ -160,10 +161,10 @@ TEST_F(RateLimiterTest, GetTotalPendingRequests) { TEST_F(RateLimiterTest, Modes) { for (auto mode : {RateLimiter::Mode::kWritesOnly, RateLimiter::Mode::kReadsOnly, RateLimiter::Mode::kAllIo}) { - GenericRateLimiter limiter(2000 /* rate_bytes_per_sec */, - 1000 * 1000 /* refill_period_us */, - 10 /* fairness */, mode, SystemClock::Default(), - false /* auto_tuned */); + GenericRateLimiter limiter( + 2000 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, mode, SystemClock::Default(), false /* auto_tuned */, + 0 /* single_burst_bytes */); limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); if (mode == RateLimiter::Mode::kWritesOnly) { @@ -347,8 +348,10 @@ TEST_F(RateLimiterTest, Rate) { // This can fail due to slow execution speed, like when using valgrind or in // heavily loaded CI environments bool skip_minimum_rate_check = -#if (defined(CIRCLECI) && defined(OS_MACOSX)) || defined(ROCKSDB_VALGRIND_RUN) +#if defined(ROCKSDB_VALGRIND_RUN) true; +#elif defined(OS_MACOSX) + getenv("CIRCLECI") || getenv("GITHUB_ACTIONS"); #else getenv("SANDCASTLE"); #endif @@ -387,7 +390,8 @@ TEST_F(RateLimiterTest, LimitChangeTest) { std::shared_ptr limiter = std::make_shared( target, refill_period, 10, RateLimiter::Mode::kWritesOnly, - SystemClock::Default(), false /* auto_tuned */); + SystemClock::Default(), false /* auto_tuned */, + 0 /* single_burst_bytes */); // After "GenericRateLimiter::Request:1" the mutex is held until the bytes // are refilled. This test could be improved to change the limit when lock // is released in `TimedWait()`. @@ -428,7 +432,7 @@ TEST_F(RateLimiterTest, AvailableByteSizeExhaustTest) { available_bytes_per_period, std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, RateLimiter::Mode::kWritesOnly, special_env.GetSystemClock(), - false /* auto_tuned */); + false /* auto_tuned */, 0 /* single_burst_bytes */); // Step 1. Request 100 and wait for the refill // so that the remaining available bytes are 400 @@ -472,7 +476,8 @@ TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { std::unique_ptr rate_limiter(new GenericRateLimiter( 1000 /* rate_bytes_per_sec */, std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, - RateLimiter::Mode::kWritesOnly, mock_clock, true /* auto_tuned */)); + RateLimiter::Mode::kWritesOnly, mock_clock, true /* auto_tuned */, + 0 /* single_burst_bytes */)); // verify rate limit increases after a sequence of periods where rate limiter // is always drained @@ -517,7 +522,8 @@ TEST_F(RateLimiterTest, WaitHangingBug) { std::make_shared(Env::Default()->GetSystemClock()); std::unique_ptr limiter(new GenericRateLimiter( kBytesPerSecond, kMicrosPerRefill, 10 /* fairness */, - RateLimiter::Mode::kWritesOnly, mock_clock, false /* auto_tuned */)); + RateLimiter::Mode::kWritesOnly, mock_clock, false /* auto_tuned */, + 0 /* single_burst_bytes */)); std::array request_threads; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( @@ -557,29 +563,43 @@ TEST_F(RateLimiterTest, RuntimeSingleBurstBytesChange) { constexpr int kMicrosecondsPerSecond = 1000000; const int64_t kRateBytesPerSec = 400; + const int64_t kRefillBytes = 100; - const int64_t kOldSingleBurstBytes = 100; - const int64_t kOldRefillPeriodUs = - kOldSingleBurstBytes * kMicrosecondsPerSecond / kRateBytesPerSec; - const int64_t kNewSingleBurstBytes = kOldSingleBurstBytes * 2; + const int64_t kRefillPeriodMicros = + kRefillBytes * kMicrosecondsPerSecond / kRateBytesPerSec; - SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true); + const int64_t kRefillsPerBurst = 17; + const int64_t kBurstBytes = kRefillBytes * kRefillsPerBurst; + + auto mock_clock = + std::make_shared(Env::Default()->GetSystemClock()); + + // Zero as `single_burst_bytes` is a special value meaning the refill size std::unique_ptr limiter(new GenericRateLimiter( - kRateBytesPerSec, kOldRefillPeriodUs, 10 /* fairness */, - RateLimiter::Mode::kWritesOnly, special_env.GetSystemClock(), - false /* auto_tuned */)); + kRateBytesPerSec, kRefillPeriodMicros, 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, mock_clock, false /* auto_tuned */, + 0 /* single_burst_bytes */)); + ASSERT_EQ(kRefillBytes, limiter->GetSingleBurstBytes()); + + // Dynamically setting to zero should change nothing + ASSERT_OK(limiter->SetSingleBurstBytes(0)); + ASSERT_EQ(kRefillBytes, limiter->GetSingleBurstBytes()); - ASSERT_EQ(kOldSingleBurstBytes, limiter->GetSingleBurstBytes()); + // Negative values are invalid and should change nothing + ASSERT_TRUE(limiter->SetSingleBurstBytes(-1).IsInvalidArgument()); + ASSERT_EQ(kRefillBytes, limiter->GetSingleBurstBytes()); - ASSERT_TRUE(limiter->SetSingleBurstBytes(0).IsInvalidArgument()); - ASSERT_OK(limiter->SetSingleBurstBytes(kNewSingleBurstBytes)); - ASSERT_EQ(kNewSingleBurstBytes, limiter->GetSingleBurstBytes()); + // Positive values take effect as the new burst size + ASSERT_OK(limiter->SetSingleBurstBytes(kBurstBytes)); + ASSERT_EQ(kBurstBytes, limiter->GetSingleBurstBytes()); - // If the updated single burst bytes is not reflected in the bytes - // granting process, this request will hang forever. + // Initially the supply is full so a request of size `kBurstBytes` needs + // `kRefillsPerBurst - 1` refill periods to elapse. limiter->Request(limiter->GetSingleBurstBytes() /* bytes */, Env::IOPriority::IO_USER, nullptr /* stats */, RateLimiter::OpType::kWrite); + ASSERT_EQ((kRefillsPerBurst - 1) * kRefillPeriodMicros, + mock_clock->NowMicros()); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/repeatable_thread_test.cc b/util/repeatable_thread_test.cc index 0b3e95464be..fc3dcd85069 100644 --- a/util/repeatable_thread_test.cc +++ b/util/repeatable_thread_test.cc @@ -81,10 +81,9 @@ TEST_F(RepeatableThreadTest, MockEnvTest) { // time RepeatableThread::wait is called, it is no guarantee that the // delay + mock_clock->NowMicros will be greater than the current real // time. However, 1000 seconds should be sufficient in most cases. - uint64_t time_us = *reinterpret_cast(arg); + uint64_t time_us = *static_cast(arg); if (time_us < mock_clock_->RealNowMicros()) { - *reinterpret_cast(arg) = - mock_clock_->RealNowMicros() + 1000; + *static_cast(arg) = mock_clock_->RealNowMicros() + 1000; } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); diff --git a/util/ribbon_config.cc b/util/ribbon_config.cc index c1046f4aaa8..792a6d7c272 100644 --- a/util/ribbon_config.cc +++ b/util/ribbon_config.cc @@ -5,11 +5,7 @@ #include "util/ribbon_config.h" -namespace ROCKSDB_NAMESPACE { - -namespace ribbon { - -namespace detail { +namespace ROCKSDB_NAMESPACE::ribbon::detail { // Each instantiation of this struct is sufficiently unique for configuration // purposes, and is only instantiated for settings where we support the @@ -499,8 +495,4 @@ template struct BandingConfigHelper1MaybeSupported< template struct BandingConfigHelper1MaybeSupported; -} // namespace detail - -} // namespace ribbon - -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::ribbon::detail diff --git a/util/ribbon_impl.h b/util/ribbon_impl.h index 0afecc67dd1..7ab8b5148d1 100644 --- a/util/ribbon_impl.h +++ b/util/ribbon_impl.h @@ -168,11 +168,11 @@ class StandardHasher { inline Hash GetHash(const Key& key) const { return TypesAndSettings::HashFn(key, raw_seed_); - }; + } // For when AddInput == pair (kIsFilter == false) inline Hash GetHash(const std::pair& bi) const { return GetHash(bi.first); - }; + } inline Index GetStart(Hash h, Index num_starts) const { // This is "critical path" code because it's required before memory // lookup. diff --git a/util/slice.cc b/util/slice.cc index 22dd7ee6bb1..9ec0af132c2 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -9,9 +9,8 @@ #include "rocksdb/slice.h" -#include - #include +#include #include "rocksdb/convenience.h" #include "rocksdb/slice_transform.h" @@ -128,7 +127,7 @@ class CappedPrefixTransform : public SliceTransform { class NoopTransform : public SliceTransform { public: - explicit NoopTransform() {} + explicit NoopTransform() = default; static const char* kClassName() { return "rocksdb.Noop"; } const char* Name() const override { return kClassName(); } @@ -173,7 +172,7 @@ static int RegisterBuiltinSliceTransform(ObjectLibrary& library, .AddNumber(":"), [](const std::string& uri, std::unique_ptr* guard, std::string* /*errmsg*/) { - auto colon = uri.find(":"); + auto colon = uri.find(':'); auto len = ParseSizeT(uri.substr(colon + 1)); guard->reset(NewFixedPrefixTransform(len)); return guard->get(); @@ -193,7 +192,7 @@ static int RegisterBuiltinSliceTransform(ObjectLibrary& library, .AddNumber(":"), [](const std::string& uri, std::unique_ptr* guard, std::string* /*errmsg*/) { - auto colon = uri.find(":"); + auto colon = uri.find(':'); auto len = ParseSizeT(uri.substr(colon + 1)); guard->reset(NewCappedPrefixTransform(len)); return guard->get(); diff --git a/util/slice_test.cc b/util/slice_test.cc index e82547494b0..0028cce8596 100644 --- a/util/slice_test.cc +++ b/util/slice_test.cc @@ -13,6 +13,7 @@ #include "rocksdb/types.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -25,8 +26,8 @@ TEST(SliceTest, StringView) { // Use this to keep track of the cleanups that were actually performed void Multiplier(void* arg1, void* arg2) { - int* res = reinterpret_cast(arg1); - int* num = reinterpret_cast(arg2); + int* res = static_cast(arg1); + int* num = static_cast(arg2); *res *= *num; } @@ -169,8 +170,8 @@ TEST_F(PinnableSliceTest, Move) { // Unit test for SmallEnumSet class SmallEnumSetTest : public testing::Test { public: - SmallEnumSetTest() {} - ~SmallEnumSetTest() {} + SmallEnumSetTest() = default; + ~SmallEnumSetTest() = default; }; TEST_F(SmallEnumSetTest, SmallEnumSetTest1) { @@ -273,6 +274,71 @@ TEST(StatusTest, Update) { ASSERT_TRUE(s.IsNotFound()); } +// ***************************************************************** // +// Unit test for UnownedPtr +TEST(UnownedPtrTest, Tests) { + { + int x = 0; + UnownedPtr p(&x); + ASSERT_EQ(p.get(), &x); + ASSERT_EQ(*p, 0); + x = 1; + ASSERT_EQ(*p, 1); + ASSERT_EQ(p.get(), &x); + ASSERT_EQ(*p, 1); + *p = 2; + ASSERT_EQ(x, 2); + ASSERT_EQ(*p, 2); + ASSERT_EQ(p.get(), &x); + ASSERT_EQ(*p, 2); + } + { + std::unique_ptr> u = + std::make_unique>(); + *u = {1, 2}; + UnownedPtr> p; + ASSERT_FALSE(p); + p = u.get(); + ASSERT_TRUE(p); + ASSERT_EQ(p->first, 1); + // These must not compile: + /* + u = p; + u = std::move(p); + std::unique_ptr> v{p}; + std::unique_ptr> v{std::move(p)}; + */ + // END must not compile + + UnownedPtr> q; + q = std::move(p); + ASSERT_EQ(q->first, 1); + // Not committing to any moved-from semantics (on p here) + } + { + std::shared_ptr> s = + std::make_shared>(); + *s = {1, 2}; + UnownedPtr> p; + ASSERT_FALSE(p); + p = s.get(); + ASSERT_TRUE(p); + ASSERT_EQ(p->first, 1); + // These must not compile: + /* + s = p; + s = std::move(p); + std::unique_ptr> t{p}; + std::unique_ptr> t{std::move(p)}; + */ + // END must not compile + UnownedPtr> q; + q = std::move(p); + ASSERT_EQ(q->first, 1); + // Not committing to any moved-from semantics (on p here) + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/util/status.cc b/util/status.cc index 160755d54d7..8f49077406b 100644 --- a/util/status.cc +++ b/util/status.cc @@ -9,7 +9,7 @@ #include "rocksdb/status.h" -#include +#include #ifdef OS_WIN #include #endif diff --git a/util/stderr_logger.cc b/util/stderr_logger.cc index 6044b8b9367..69e9989f04e 100644 --- a/util/stderr_logger.cc +++ b/util/stderr_logger.cc @@ -6,10 +6,15 @@ #include "util/stderr_logger.h" +#include "port/malloc.h" #include "port/sys_time.h" namespace ROCKSDB_NAMESPACE { -StderrLogger::~StderrLogger() {} +StderrLogger::~StderrLogger() { + if (log_prefix != nullptr) { + free((void*)log_prefix); + } +} void StderrLogger::Logv(const char* format, va_list ap) { const uint64_t thread_id = Env::Default()->GetThreadID(); @@ -19,12 +24,40 @@ void StderrLogger::Logv(const char* format, va_list ap) { const time_t seconds = now_tv.tv_sec; struct tm t; port::LocalTimeR(&seconds, &t); - fprintf(stderr, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", t.tm_year + 1900, - t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, - static_cast(now_tv.tv_usec), - static_cast(thread_id)); - vfprintf(stderr, format, ap); - fprintf(stderr, "\n"); + // The string we eventually log has three parts: the context (time, thread), + // optional user-supplied prefix, and the actual log message (the "suffix"). + // + // We compute their lengths so that we can allocate a buffer big enough to + // print it. The context string (with the date and thread id) is really only + // 44 bytes, but we allocate 50 to be safe. + // + // ctx_len = 44 = ( 4+ 1+ 2+1+2+ 1+2+ 1+2+ 1+ 2+1+6+ 1+16+1) + const char* ctx_prefix_fmt = "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx %s"; + size_t ctx_len = 50; + + va_list ap_copy; + va_copy(ap_copy, ap); + const size_t log_suffix_len = vsnprintf(nullptr, 0, format, ap_copy); + va_end(ap_copy); + + // Allocate space for the context, log_prefix, and log itself + // Extra byte for null termination + size_t buf_len = ctx_len + log_prefix_len + log_suffix_len + 1; + std::unique_ptr buf(new char[buf_len]); + + // If the logger was created without a prefix, the prefix is a nullptr + const char* prefix = log_prefix == nullptr ? "" : log_prefix; + + // Write out the context and prefix string + int written = + snprintf(buf.get(), ctx_len + log_prefix_len, ctx_prefix_fmt, + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, + t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id), prefix); + written += vsnprintf(buf.get() + written, log_suffix_len, format, ap); + buf[written] = '\0'; + + fprintf(stderr, "%s%c", buf.get(), '\n'); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/stderr_logger.h b/util/stderr_logger.h index c3b01210cea..6a3fe3bd798 100644 --- a/util/stderr_logger.h +++ b/util/stderr_logger.h @@ -17,7 +17,11 @@ namespace ROCKSDB_NAMESPACE { class StderrLogger : public Logger { public: explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) - : Logger(log_level) {} + : Logger(log_level), log_prefix(nullptr) {} + explicit StderrLogger(const InfoLogLevel log_level, const std::string prefix) + : Logger(log_level), + log_prefix(strdup(prefix.c_str())), + log_prefix_len(strlen(log_prefix)) {} ~StderrLogger() override; @@ -25,7 +29,13 @@ class StderrLogger : public Logger { // a subset of them. using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override; + void Logv(const char* format, va_list ap) override; + + private: + // This prefix will be appended after the time/thread info of every log + const char* log_prefix; + // The length of the log_prefix + size_t log_prefix_len; }; } // namespace ROCKSDB_NAMESPACE diff --git a/util/string_util.cc b/util/string_util.cc index 57207889f1a..6e99723c18f 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -5,13 +5,12 @@ // #include "util/string_util.h" -#include -#include -#include - #include +#include #include #include +#include +#include #include #include #include @@ -115,8 +114,17 @@ void AppendEscapedStringTo(std::string* str, const Slice& value) { } std::string NumberToHumanString(int64_t num) { - char buf[19]; - int64_t absnum = num < 0 ? -num : num; + char buf[21]; + int64_t absnum; + + if (num < 0) { + // abs(INT64_MIN) is INT64_MAX+1 which overflows int64_t and become itself. + // So we convert it to INT64_MAX to avoid fall into <10000 slot. + absnum = num == INT64_MIN ? INT64_MAX : -num; + } else { + absnum = num; + } + if (absnum < 10000) { snprintf(buf, sizeof(buf), "%" PRIi64, num); } else if (absnum < 10000000) { @@ -257,7 +265,9 @@ std::string UnescapeOptionString(const std::string& escaped_string) { } std::string trim(const std::string& str) { - if (str.empty()) return std::string(); + if (str.empty()) { + return std::string(); + } size_t start = 0; size_t end = str.size() - 1; while (isspace(str[start]) != 0 && start < end) { @@ -337,14 +347,15 @@ uint64_t ParseUint64(const std::string& value) { if (endchar < value.length()) { char c = value[endchar]; - if (c == 'k' || c == 'K') + if (c == 'k' || c == 'K') { num <<= 10LL; - else if (c == 'm' || c == 'M') + } else if (c == 'm' || c == 'M') { num <<= 20LL; - else if (c == 'g' || c == 'G') + } else if (c == 'g' || c == 'G') { num <<= 30LL; - else if (c == 't' || c == 'T') + } else if (c == 't' || c == 'T') { num <<= 40LL; + } } return num; @@ -362,14 +373,15 @@ int64_t ParseInt64(const std::string& value) { if (endchar < value.length()) { char c = value[endchar]; - if (c == 'k' || c == 'K') + if (c == 'k' || c == 'K') { num <<= 10LL; - else if (c == 'm' || c == 'M') + } else if (c == 'm' || c == 'M') { num <<= 20LL; - else if (c == 'g' || c == 'G') + } else if (c == 'g' || c == 'G') { num <<= 30LL; - else if (c == 't' || c == 'T') + } else if (c == 't' || c == 'T') { num <<= 40LL; + } } return num; @@ -387,12 +399,13 @@ int ParseInt(const std::string& value) { if (endchar < value.length()) { char c = value[endchar]; - if (c == 'k' || c == 'K') + if (c == 'k' || c == 'K') { num <<= 10; - else if (c == 'm' || c == 'M') + } else if (c == 'm' || c == 'M') { num <<= 20; - else if (c == 'g' || c == 'G') + } else if (c == 'g' || c == 'G') { num <<= 30; + } } return num; diff --git a/util/string_util.h b/util/string_util.h index 999081ebba9..1374642a6cd 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -18,14 +18,14 @@ namespace ROCKSDB_NAMESPACE { class Slice; -extern std::vector StringSplit(const std::string& arg, char delim); +std::vector StringSplit(const std::string& arg, char delim); // Append a human-readable printout of "num" to *str -extern void AppendNumberTo(std::string* str, uint64_t num); +void AppendNumberTo(std::string* str, uint64_t num); // Append a human-readable printout of "value" to *str. // Escapes any non-printable characters found in "value". -extern void AppendEscapedStringTo(std::string* str, const Slice& value); +void AppendEscapedStringTo(std::string* str, const Slice& value); // Put n digits from v in base kBase to (*buf)[0] to (*buf)[n-1] and // advance *buf to the position after what was written. @@ -73,15 +73,15 @@ inline bool ParseBaseChars(const char** buf, size_t n, uint64_t* v) { // for num >= 10.000, prints "xxK" // for num >= 10.000.000, prints "xxM" // for num >= 10.000.000.000, prints "xxG" -extern std::string NumberToHumanString(int64_t num); +std::string NumberToHumanString(int64_t num); // Return a human-readable version of bytes // ex: 1048576 -> 1.00 GB -extern std::string BytesToHumanString(uint64_t bytes); +std::string BytesToHumanString(uint64_t bytes); // Return a human-readable version of unix time // ex: 1562116015 -> "Tue Jul 2 18:06:55 2019" -extern std::string TimeToHumanString(int unixtime); +std::string TimeToHumanString(int unixtime); // Append a human-readable time in micros. int AppendHumanMicros(uint64_t micros, char* output, int len, @@ -92,13 +92,13 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len); // Return a human-readable version of "value". // Escapes any non-printable characters found in "value". -extern std::string EscapeString(const Slice& value); +std::string EscapeString(const Slice& value); // Parse a human-readable number from "*in" into *value. On success, // advances "*in" past the consumed number and sets "*val" to the // numeric value. Otherwise, returns false and leaves *in in an // unspecified state. -extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); +bool ConsumeDecimalNumber(Slice* in, uint64_t* val); // Returns true if the input char "c" is considered as a special character // that will be escaped when EscapeOptionString() is called. @@ -180,6 +180,6 @@ extern const std::string kNullptrString; // errnoStr() function returns a string that describes the error code passed in // the argument err -extern std::string errnoStr(int err); +std::string errnoStr(int err); } // namespace ROCKSDB_NAMESPACE diff --git a/util/string_util_test.cc b/util/string_util_test.cc new file mode 100644 index 00000000000..54dd35ae5fd --- /dev/null +++ b/util/string_util_test.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "string_util.h" + +#include +#include + +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(StringUtilTest, NumberToHumanString) { + ASSERT_EQ("-9223372036G", NumberToHumanString(INT64_MIN)); + ASSERT_EQ("9223372036G", NumberToHumanString(INT64_MAX)); + ASSERT_EQ("0", NumberToHumanString(0)); + ASSERT_EQ("9999", NumberToHumanString(9999)); + ASSERT_EQ("10K", NumberToHumanString(10000)); + ASSERT_EQ("10M", NumberToHumanString(10000000)); + ASSERT_EQ("10G", NumberToHumanString(10000000000)); + ASSERT_EQ("-9999", NumberToHumanString(-9999)); + ASSERT_EQ("-10K", NumberToHumanString(-10000)); + ASSERT_EQ("-10M", NumberToHumanString(-10000000)); + ASSERT_EQ("-10G", NumberToHumanString(-10000000000)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index b5b3378fab9..4899b98ac4d 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -79,7 +79,7 @@ class SimulatedBackgroundTask { } static void DoSimulatedTask(void* arg) { - reinterpret_cast(arg)->Run(); + static_cast(arg)->Run(); } private: @@ -97,7 +97,7 @@ class SimulatedBackgroundTask { class ThreadListTest : public testing::Test { public: - ThreadListTest() {} + ThreadListTest() = default; }; TEST_F(ThreadListTest, GlobalTables) { @@ -161,7 +161,7 @@ TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { // Verify the number of running threads in each pool. ASSERT_OK(env->GetThreadList(&thread_list)); int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0}; - for (auto thread_status : thread_list) { + for (const auto& thread_status : thread_list) { if (thread_status.cf_name == "pikachu" && thread_status.db_name == "running") { running_count[thread_status.thread_type]++; @@ -189,7 +189,7 @@ TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) { running_count[i] = 0; } - for (auto thread_status : thread_list) { + for (const auto& thread_status : thread_list) { if (thread_status.cf_name == "pikachu" && thread_status.db_name == "running") { running_count[thread_status.thread_type]++; @@ -204,7 +204,7 @@ TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { namespace { void UpdateStatusCounts(const std::vector& thread_list, int operation_counts[], int state_counts[]) { - for (auto thread_status : thread_list) { + for (const auto& thread_status : thread_list) { operation_counts[thread_status.operation_type]++; state_counts[thread_status.state_type]++; } diff --git a/util/thread_local.cc b/util/thread_local.cc index 969639d9bc9..805a0aad634 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -9,7 +9,7 @@ #include "util/thread_local.h" -#include +#include #include "port/likely.h" #include "util/mutexlock.h" diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc index 09706cac57d..8397c4b3907 100644 --- a/util/threadpool_imp.cc +++ b/util/threadpool_imp.cc @@ -18,11 +18,10 @@ #include #endif -#include - #include #include #include +#include #include #include #include @@ -322,7 +321,7 @@ struct BGThreadMetadata { }; void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) { - BGThreadMetadata* meta = reinterpret_cast(arg); + BGThreadMetadata* meta = static_cast(arg); size_t thread_id = meta->thread_id_; ThreadPoolImpl::Impl* tp = meta->thread_pool_; #ifdef ROCKSDB_USING_THREAD_STATUS @@ -465,7 +464,7 @@ int ThreadPoolImpl::Impl::UnSchedule(void* arg) { ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) {} -ThreadPoolImpl::~ThreadPoolImpl() {} +ThreadPoolImpl::~ThreadPoolImpl() = default; void ThreadPoolImpl::JoinAllThreads() { impl_->JoinThreads(false); } diff --git a/util/threadpool_imp.h b/util/threadpool_imp.h index a5109e38f51..a3a3a39a264 100644 --- a/util/threadpool_imp.h +++ b/util/threadpool_imp.h @@ -101,7 +101,7 @@ class ThreadPoolImpl : public ThreadPool { struct Impl; private: - // Current public virtual interface does not provide usable + // Current public interface does not provide usable // functionality and thus can not be used internally to // facade different implementations. // diff --git a/util/udt_util_test.cc b/util/udt_util_test.cc index 44ee567f744..8f45d564a5b 100644 --- a/util/udt_util_test.cc +++ b/util/udt_util_test.cc @@ -20,16 +20,16 @@ static const std::string kValuePlaceHolder = "value"; class HandleTimestampSizeDifferenceTest : public testing::Test { public: - HandleTimestampSizeDifferenceTest() {} + HandleTimestampSizeDifferenceTest() = default; // Test handler used to collect the column family id and user keys contained // in a WriteBatch for test verification. And verifies the value part stays // the same if it's available. class KeyCollector : public WriteBatch::Handler { public: - explicit KeyCollector() {} + explicit KeyCollector() = default; - ~KeyCollector() override {} + ~KeyCollector() override = default; Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { if (value.compare(kValuePlaceHolder) != 0) { @@ -90,7 +90,7 @@ class HandleTimestampSizeDifferenceTest : public testing::Test { private: Status AddKey(uint32_t cf, const Slice& key) { - keys_.push_back(std::make_pair(cf, key)); + keys_.emplace_back(cf, key); return Status::OK(); } std::vector> keys_; diff --git a/util/vector_iterator.h b/util/vector_iterator.h index c4cc01d561e..fccc23bf906 100644 --- a/util/vector_iterator.h +++ b/util/vector_iterator.h @@ -33,14 +33,14 @@ class VectorIterator : public InternalIterator { } } - virtual bool Valid() const override { + bool Valid() const override { return !indices_.empty() && current_ < indices_.size(); } - virtual void SeekToFirst() override { current_ = 0; } - virtual void SeekToLast() override { current_ = indices_.size() - 1; } + void SeekToFirst() override { current_ = 0; } + void SeekToLast() override { current_ = indices_.size() - 1; } - virtual void Seek(const Slice& target) override { + void Seek(const Slice& target) override { if (indexed_cmp_.cmp != nullptr) { current_ = std::lower_bound(indices_.begin(), indices_.end(), target, indexed_cmp_) - @@ -52,7 +52,7 @@ class VectorIterator : public InternalIterator { } } - virtual void SeekForPrev(const Slice& target) override { + void SeekForPrev(const Slice& target) override { if (indexed_cmp_.cmp != nullptr) { current_ = std::upper_bound(indices_.begin(), indices_.end(), target, indexed_cmp_) - @@ -69,20 +69,16 @@ class VectorIterator : public InternalIterator { } } - virtual void Next() override { current_++; } - virtual void Prev() override { current_--; } + void Next() override { current_++; } + void Prev() override { current_--; } - virtual Slice key() const override { - return Slice(keys_[indices_[current_]]); - } - virtual Slice value() const override { - return Slice(values_[indices_[current_]]); - } + Slice key() const override { return Slice(keys_[indices_[current_]]); } + Slice value() const override { return Slice(values_[indices_[current_]]); } - virtual Status status() const override { return Status::OK(); } + Status status() const override { return Status::OK(); } - virtual bool IsKeyPinned() const override { return true; } - virtual bool IsValuePinned() const override { return true; } + bool IsKeyPinned() const override { return true; } + bool IsValuePinned() const override { return true; } protected: std::vector keys_; diff --git a/util/write_batch_util.h b/util/write_batch_util.h index 70bbad9fc78..6986d25d0d0 100644 --- a/util/write_batch_util.h +++ b/util/write_batch_util.h @@ -32,6 +32,11 @@ class ColumnFamilyCollector : public WriteBatch::Handler { return AddColumnFamilyId(column_family_id); } + Status TimedPutCF(uint32_t column_family_id, const Slice&, const Slice&, + uint64_t) override { + return AddColumnFamilyId(column_family_id); + } + Status DeleteCF(uint32_t column_family_id, const Slice&) override { return AddColumnFamilyId(column_family_id); } diff --git a/util/xxhash.h b/util/xxhash.h index 2b9c228835f..bd31cfafbc4 100644 --- a/util/xxhash.h +++ b/util/xxhash.h @@ -2329,7 +2329,7 @@ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ } while (0) - if (ptr==NULL) XXH_ASSERT(len == 0); + if (ptr==NULL) XXH_ASSERT(len == 0) /* Compact rerolled version; generally faster */ if (!XXH32_ENDJMP) { @@ -2383,7 +2383,7 @@ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) XXH_FALLTHROUGH; /* fallthrough */ case 0: return XXH32_avalanche(hash); } - XXH_ASSERT(0); + XXH_ASSERT(0) return hash; /* reaching this point is deemed impossible */ } } @@ -2409,7 +2409,7 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment { xxh_u32 h32; - if (input==NULL) XXH_ASSERT(len == 0); + if (input==NULL) XXH_ASSERT(len == 0) if (len>=16) { const xxh_u8* const bEnd = input + len; @@ -2481,7 +2481,7 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t /*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) { - XXH_ASSERT(statePtr != NULL); + XXH_ASSERT(statePtr != NULL) memset(statePtr, 0, sizeof(*statePtr)); statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; statePtr->v[1] = seed + XXH_PRIME32_2; @@ -2496,7 +2496,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state, const void* input, size_t len) { if (input==NULL) { - XXH_ASSERT(len == 0); + XXH_ASSERT(len == 0) return XXH_OK; } @@ -2802,7 +2802,7 @@ static xxh_u64 XXH64_avalanche(xxh_u64 hash) static XXH_PUREF xxh_u64 XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) { - if (ptr==NULL) XXH_ASSERT(len == 0); + if (ptr==NULL) XXH_ASSERT(len == 0) len &= 31; while (len >= 8) { xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); @@ -2847,7 +2847,7 @@ XXH_FORCE_INLINE XXH_PUREF xxh_u64 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) { xxh_u64 h64; - if (input==NULL) XXH_ASSERT(len == 0); + if (input==NULL) XXH_ASSERT(len == 0) if (len>=32) { const xxh_u8* const bEnd = input + len; @@ -2923,7 +2923,7 @@ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const /*! @ingroup XXH64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) { - XXH_ASSERT(statePtr != NULL); + XXH_ASSERT(statePtr != NULL) memset(statePtr, 0, sizeof(*statePtr)); statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; statePtr->v[1] = seed + XXH_PRIME64_2; @@ -2937,7 +2937,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) { if (input==NULL) { - XXH_ASSERT(len == 0); + XXH_ASSERT(len == 0) return XXH_OK; } @@ -3849,7 +3849,7 @@ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) /*! Seems to produce slightly better code on GCC for some reason. */ XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { - XXH_ASSERT(0 <= shift && shift < 64); + XXH_ASSERT(0 <= shift && shift < 64) return v64 ^ (v64 >> shift); } @@ -3917,9 +3917,9 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); + XXH_ASSERT(input != NULL) + XXH_ASSERT(1 <= len && len <= 3) + XXH_ASSERT(secret != NULL) /* * len = 1: combined = { input[0], 0x01, input[0], input[0] } * len = 2: combined = { input[1], 0x02, input[0], input[1] } @@ -3939,9 +3939,9 @@ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); + XXH_ASSERT(input != NULL) + XXH_ASSERT(secret != NULL) + XXH_ASSERT(4 <= len && len <= 8) seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; { xxh_u32 const input1 = XXH_readLE32(input); xxh_u32 const input2 = XXH_readLE32(input + len - 4); @@ -3955,9 +3955,9 @@ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); + XXH_ASSERT(input != NULL) + XXH_ASSERT(secret != NULL) + XXH_ASSERT(9 <= len && len <= 16) { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; @@ -3972,7 +3972,7 @@ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(len <= 16); + XXH_ASSERT(len <= 16) { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); if (len) return XXH3_len_1to3_64b(input, len, secret, seed); @@ -4044,8 +4044,8 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) (void)secretSize; + XXH_ASSERT(16 < len && len <= 128) { xxh_u64 acc = len * XXH_PRIME64_1, acc_end; #if XXH_SIZE_OPT >= 1 @@ -4084,8 +4084,8 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX) #define XXH3_MIDSIZE_STARTOFFSET 3 #define XXH3_MIDSIZE_LASTOFFSET 17 @@ -4094,13 +4094,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, xxh_u64 acc_end; unsigned int const nbRounds = (unsigned int)len / 16; unsigned int i; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX) for (i=0; i<8; i++) { acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); } /* last bytes */ acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); - XXH_ASSERT(nbRounds >= 8); + XXH_ASSERT(nbRounds >= 8) acc = XXH3_avalanche(acc); #if defined(__clang__) /* Clang */ \ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ @@ -4248,7 +4248,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { __m512i* const xacc = (__m512i *) acc; - XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_ASSERT((((size_t)acc) & 63) == 0) XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); { @@ -4295,7 +4295,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_ASSERT((((size_t)acc) & 63) == 0) XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); { __m512i* const xacc = (__m512i*) acc; const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); @@ -4320,7 +4320,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) { XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); - XXH_ASSERT(((size_t)customSecret & 63) == 0); + XXH_ASSERT(((size_t)customSecret & 63) == 0) (void)(&XXH_writeLE64); { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); @@ -4329,8 +4329,8 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); __m512i* const dest = ( __m512i*) customSecret; int i; - XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dest & 63) == 0); + XXH_ASSERT(((size_t)src & 63) == 0) /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0) for (i=0; i < nbRounds; ++i) { dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); } } @@ -4350,7 +4350,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 31) == 0); + XXH_ASSERT((((size_t)acc) & 31) == 0) { __m256i* const xacc = (__m256i *) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ @@ -4383,7 +4383,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 31) == 0); + XXH_ASSERT((((size_t)acc) & 31) == 0) { __m256i* const xacc = (__m256i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ @@ -4429,8 +4429,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR */ XXH_COMPILER_GUARD(dest); # endif - XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dest & 31) == 0); + XXH_ASSERT(((size_t)src & 31) == 0) /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0) /* GCC -O2 need unroll loop manually */ dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); @@ -4457,7 +4457,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { /* SSE2 is just a half-scale version of the AVX2 version. */ - XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_ASSERT((((size_t)acc) & 15) == 0) { __m128i* const xacc = (__m128i *) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ @@ -4490,7 +4490,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_ASSERT((((size_t)acc) & 15) == 0) { __m128i* const xacc = (__m128i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ @@ -4541,8 +4541,8 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR */ XXH_COMPILER_GUARD(dst16); # endif - XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dst16 & 15) == 0); + XXH_ASSERT(((size_t)src16 & 15) == 0) /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0) for (i=0; i < nbRounds; ++i) { dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); @@ -4577,7 +4577,7 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_ASSERT((((size_t)acc) & 15) == 0) XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); { uint64x2_t* const xacc = (uint64x2_t *) acc; @@ -4663,7 +4663,7 @@ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_ASSERT((((size_t)acc) & 15) == 0) { uint64x2_t* xacc = (uint64x2_t*) acc; uint8_t const* xsecret = (uint8_t const*) secret; @@ -4762,7 +4762,7 @@ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { - XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_ASSERT((((size_t)acc) & 15) == 0) { xxh_u64x2* const xacc = (xxh_u64x2*) acc; const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; @@ -4918,8 +4918,8 @@ XXH3_scalarRound(void* XXH_RESTRICT acc, xxh_u64* xacc = (xxh_u64*) acc; xxh_u8 const* xinput = (xxh_u8 const*) input; xxh_u8 const* xsecret = (xxh_u8 const*) secret; - XXH_ASSERT(lane < XXH_ACC_NB); - XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB) + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0) { xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); @@ -4965,8 +4965,8 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, { xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ - XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); - XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0) + XXH_ASSERT(lane < XXH_ACC_NB) { xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); xxh_u64 acc64 = xacc[lane]; @@ -5040,7 +5040,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) * Note: in debug mode, this overrides the asm optimization * and Clang will emit MOVK chains again. */ - XXH_ASSERT(kSecretPtr == XXH3_kSecret); + XXH_ASSERT(kSecretPtr == XXH3_kSecret) { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; int i; @@ -5132,7 +5132,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, size_t n; - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) for (n = 0; n < nb_blocks; n++) { f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); @@ -5140,9 +5140,9 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, } /* last partial block */ - XXH_ASSERT(len > XXH_STRIPE_LEN); + XXH_ASSERT(len > XXH_STRIPE_LEN) { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; - XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)) f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); /* last stripe */ @@ -5204,7 +5204,7 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, XXH_STATIC_ASSERT(sizeof(acc) == 64); /* do not align on 8, so that the secret is different from the accumulator */ #define XXH_SECRET_MERGEACCS_START 11 - XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START) return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); } @@ -5287,7 +5287,7 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, XXH3_hashLong64_f f_hashLong) { - XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN) /* * If an action is to be taken if `secretLen` condition is not respected, * it should be done here. @@ -5363,9 +5363,9 @@ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH */ static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) { - XXH_ASSERT(align <= 128 && align >= 8); /* range check */ - XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ - XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + XXH_ASSERT(align <= 128 && align >= 8) /* range check */ + XXH_ASSERT((align & (align-1)) == 0) /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)) /* empty/overflow */ { /* Overallocate to make room for manual realignment and an offset byte */ xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); if (base != NULL) { @@ -5379,7 +5379,7 @@ static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) /* Add the offset for the now-aligned pointer */ xxh_u8* ptr = base + offset; - XXH_ASSERT((size_t)ptr % align == 0); + XXH_ASSERT((size_t)ptr % align == 0) /* Store the offset immediately before the returned pointer. */ ptr[-1] = (xxh_u8)offset; @@ -5433,8 +5433,8 @@ XXH3_reset_internal(XXH3_state_t* statePtr, { size_t const initStart = offsetof(XXH3_state_t, bufferedSize); size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; - XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); - XXH_ASSERT(statePtr != NULL); + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart) + XXH_ASSERT(statePtr != NULL) /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ memset((char*)statePtr + initStart, 0, initLength); statePtr->acc[0] = XXH_PRIME32_3; @@ -5448,7 +5448,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr, statePtr->seed = seed; statePtr->useSeed = (seed != 0); statePtr->extSecret = (const unsigned char*)secret; - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; } @@ -5508,8 +5508,8 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { - XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ - XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + XXH_ASSERT(nbStripes <= nbStripesPerBlock) /* can handle max 1 scramble per invocation */ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock) if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { /* need a scrambling operation */ size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; @@ -5539,11 +5539,11 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, XXH3_f_scrambleAcc f_scramble) { if (input==NULL) { - XXH_ASSERT(len == 0); + XXH_ASSERT(len == 0) return XXH_OK; } - XXH_ASSERT(state != NULL); + XXH_ASSERT(state != NULL) { const xxh_u8* const bEnd = input + len; const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 @@ -5556,7 +5556,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, xxh_u64* XXH_RESTRICT const acc = state->acc; #endif state->totalLen += len; - XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE) /* small input : just fill in tmp buffer */ if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { @@ -5584,15 +5584,15 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, f_acc, f_scramble); state->bufferedSize = 0; } - XXH_ASSERT(input < bEnd); + XXH_ASSERT(input < bEnd) /* large input to consume : ingest per full block */ if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; - XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); + XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar) /* join to current block's end */ { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; - XXH_ASSERT(nbStripesToEnd <= nbStripes); + XXH_ASSERT(nbStripesToEnd <= nbStripes) f_acc(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd); f_scramble(acc, secret + state->secretLimit); state->nbStripesSoFar = 0; @@ -5609,11 +5609,11 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, /* consume last partial block */ f_acc(acc, input, secret, nbStripes); input += nbStripes * XXH_STRIPE_LEN; - XXH_ASSERT(input < bEnd); /* at least some bytes left */ + XXH_ASSERT(input < bEnd) /* at least some bytes left */ state->nbStripesSoFar = nbStripes; /* buffer predecessor of last partial stripe */ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); - XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); + XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN) } else { /* content to consume <= block size */ /* Consume input by a multiple of internal buffer size */ @@ -5633,9 +5633,9 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, } /* Some remaining input (always) : buffer it */ - XXH_ASSERT(input < bEnd); - XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); - XXH_ASSERT(state->bufferedSize == 0); + XXH_ASSERT(input < bEnd) + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE) + XXH_ASSERT(state->bufferedSize == 0) XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); state->bufferedSize = (XXH32_hash_t)(bEnd-input); #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 @@ -5681,7 +5681,7 @@ XXH3_digest_long (XXH64_hash_t* acc, } else { /* bufferedSize < XXH_STRIPE_LEN */ xxh_u8 lastStripe[XXH_STRIPE_LEN]; size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; - XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_ASSERT(state->bufferedSize > 0) /* there is always some input buffered */ XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); XXH3_accumulate_512(acc, @@ -5731,9 +5731,9 @@ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { /* A doubled version of 1to3_64b with different constants. */ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); + XXH_ASSERT(input != NULL) + XXH_ASSERT(1 <= len && len <= 3) + XXH_ASSERT(secret != NULL) /* * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } @@ -5759,9 +5759,9 @@ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); + XXH_ASSERT(input != NULL) + XXH_ASSERT(secret != NULL) + XXH_ASSERT(4 <= len && len <= 8) seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; { xxh_u32 const input_lo = XXH_readLE32(input); xxh_u32 const input_hi = XXH_readLE32(input + len - 4); @@ -5786,9 +5786,9 @@ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); + XXH_ASSERT(input != NULL) + XXH_ASSERT(secret != NULL) + XXH_ASSERT(9 <= len && len <= 16) { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; xxh_u64 const input_lo = XXH_readLE64(input); @@ -5861,7 +5861,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { - XXH_ASSERT(len <= 16); + XXH_ASSERT(len <= 16) { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); if (len) return XXH3_len_1to3_128b(input, len, secret, seed); @@ -5894,8 +5894,8 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) (void)secretSize; + XXH_ASSERT(16 < len && len <= 128) { XXH128_hash_t acc; acc.low64 = len * XXH_PRIME64_1; @@ -5938,8 +5938,8 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX) { XXH128_hash_t acc; unsigned i; @@ -6003,7 +6003,7 @@ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); - XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START) { XXH128_hash_t h128; h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, @@ -6081,7 +6081,7 @@ XXH3_128bits_internal(const void* input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, XXH3_hashLong128_f f_hl128) { - XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN) /* * If an action is to be taken if `secret` conditions are not respected, * it should be done here. @@ -6193,7 +6193,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_ if (state->totalLen > XXH3_MIDSIZE_MAX) { XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; XXH3_digest_long(acc, state, secret); - XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START) { XXH128_hash_t h128; h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, @@ -6283,8 +6283,8 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) { #if (XXH_DEBUGLEVEL >= 1) - XXH_ASSERT(secretBuffer != NULL); - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + XXH_ASSERT(secretBuffer != NULL) + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN) #else /* production mode, assert() are disabled */ if (secretBuffer == NULL) return XXH_ERROR; @@ -6296,7 +6296,7 @@ XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOES customSeedSize = XXH_SECRET_DEFAULT_SIZE; } #if (XXH_DEBUGLEVEL >= 1) - XXH_ASSERT(customSeed != NULL); + XXH_ASSERT(customSeed != NULL) #else if (customSeed == NULL) return XXH_ERROR; #endif @@ -6329,7 +6329,7 @@ XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed) { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; XXH3_initCustomSecret(secret, seed); - XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretBuffer != NULL) memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE); } diff --git a/utilities/agg_merge/agg_merge.cc b/utilities/agg_merge/agg_merge.cc index 8e5c536f559..a13d861e841 100644 --- a/utilities/agg_merge/agg_merge.cc +++ b/utilities/agg_merge/agg_merge.cc @@ -5,8 +5,7 @@ #include "rocksdb/utilities/agg_merge.h" -#include - +#include #include #include #include @@ -24,7 +23,7 @@ namespace ROCKSDB_NAMESPACE { static std::unordered_map> func_map; -const std::string kUnnamedFuncName = ""; +const std::string kUnnamedFuncName; const std::string kErrorFuncName = "kErrorFuncName"; Status AddAggregator(const std::string& function_name, @@ -37,7 +36,7 @@ Status AddAggregator(const std::string& function_name, return Status::OK(); } -AggMergeOperator::AggMergeOperator() {} +AggMergeOperator::AggMergeOperator() = default; std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name, const Slice& value) { @@ -123,7 +122,7 @@ class AggMergeOperator::Accumulator { } std::swap(scratch_, aggregated_); values_.clear(); - values_.push_back(aggregated_); + values_.emplace_back(aggregated_); func_ = my_func; } values_.push_back(my_value); diff --git a/utilities/agg_merge/agg_merge_impl.h b/utilities/agg_merge/agg_merge_impl.h index 00e58de0885..73aa35e49ef 100644 --- a/utilities/agg_merge/agg_merge_impl.h +++ b/utilities/agg_merge/agg_merge_impl.h @@ -44,6 +44,6 @@ class AggMergeOperator : public MergeOperator { static Accumulator& GetTLSAccumulator(); }; -extern std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name, - const Slice& value); +std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name, + const Slice& value); } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/agg_merge/test_agg_merge.cc b/utilities/agg_merge/test_agg_merge.cc index 63b89cccd69..03bb2a2cd60 100644 --- a/utilities/agg_merge/test_agg_merge.cc +++ b/utilities/agg_merge/test_agg_merge.cc @@ -5,8 +5,7 @@ #include "test_agg_merge.h" -#include - +#include #include #include diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc index 31a7337315f..ba46e04e02a 100644 --- a/utilities/backup/backup_engine.cc +++ b/utilities/backup/backup_engine.cc @@ -384,7 +384,7 @@ class BackupEngineImpl { BackupMeta(const BackupMeta&) = delete; BackupMeta& operator=(const BackupMeta&) = delete; - ~BackupMeta() {} + ~BackupMeta() = default; void RecordTimestamp() { // Best effort @@ -639,11 +639,9 @@ class BackupEngineImpl { std::string db_session_id; CopyOrCreateWorkItem() - : src_path(""), - dst_path(""), - src_temperature(Temperature::kUnknown), + : src_temperature(Temperature::kUnknown), dst_temperature(Temperature::kUnknown), - contents(""), + src_env(nullptr), dst_env(nullptr), src_env_options(), @@ -651,10 +649,7 @@ class BackupEngineImpl { rate_limiter(nullptr), size_limit(0), stats(nullptr), - src_checksum_func_name(kUnknownFileChecksumFuncName), - src_checksum_hex(""), - db_id(""), - db_session_id("") {} + src_checksum_func_name(kUnknownFileChecksumFuncName) {} CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete; CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete; @@ -727,12 +722,7 @@ class BackupEngineImpl { std::string dst_path; std::string dst_relative; BackupAfterCopyOrCreateWorkItem() - : shared(false), - needed_to_copy(false), - backup_env(nullptr), - dst_path_tmp(""), - dst_path(""), - dst_relative("") {} + : shared(false), needed_to_copy(false), backup_env(nullptr) {} BackupAfterCopyOrCreateWorkItem( BackupAfterCopyOrCreateWorkItem&& o) noexcept { @@ -773,7 +763,7 @@ class BackupEngineImpl { std::string from_file; std::string to_file; std::string checksum_hex; - RestoreAfterCopyOrCreateWorkItem() : checksum_hex("") {} + RestoreAfterCopyOrCreateWorkItem() {} RestoreAfterCopyOrCreateWorkItem(std::future&& _result, const std::string& _from_file, const std::string& _to_file, @@ -874,7 +864,7 @@ class BackupEngineImplThreadSafe : public BackupEngine, BackupEngineImplThreadSafe(const BackupEngineOptions& options, Env* db_env, bool read_only = false) : impl_(options, db_env, read_only) {} - ~BackupEngineImplThreadSafe() override {} + ~BackupEngineImplThreadSafe() override = default; using BackupEngine::CreateNewBackupWithMetadata; IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options, @@ -1556,11 +1546,10 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( } ROCKS_LOG_INFO(options_.info_log, "dispatch files for backup done, wait for finish."); - IOStatus item_io_status; for (auto& item : backup_items_to_finish) { item.result.wait(); auto result = item.result.get(); - item_io_status = result.io_status; + IOStatus item_io_status = result.io_status; Temperature temp = result.expected_src_temperature; if (result.current_src_temperature != Temperature::kUnknown && (temp == Temperature::kUnknown || @@ -1577,13 +1566,14 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( result.db_session_id, temp)); } if (!item_io_status.ok()) { - io_s = item_io_status; + io_s = std::move(item_io_status); + io_s.MustCheck(); } } // we copied all the files, enable file deletions if (disabled.ok()) { // If we successfully disabled file deletions - db->EnableFileDeletions(/*force=*/false).PermitUncheckedError(); + db->EnableFileDeletions().PermitUncheckedError(); } auto backup_time = backup_env_->NowMicros() - start_backup; @@ -2195,6 +2185,7 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( rate_limiter ? static_cast(rate_limiter->GetSingleBurstBytes()) : kDefaultCopyFileBufferSize; + // TODO: pass in Histograms if the destination file is sst or blob std::unique_ptr dest_writer( new WritableFileWriter(std::move(dst_file), dst, dst_file_options)); std::unique_ptr src_reader; @@ -2209,6 +2200,7 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( } Slice data; + const IOOptions opts; do { if (stop_backup_.load(std::memory_order_acquire)) { return status_to_io_status(Status::Incomplete("Backup stopped")); @@ -2238,7 +2230,8 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( if (checksum_hex != nullptr) { checksum_value = crc32c::Extend(checksum_value, data.data(), data.size()); } - io_s = dest_writer->Append(data); + + io_s = dest_writer->Append(opts, data); if (rate_limiter != nullptr) { if (!src.empty()) { @@ -2275,10 +2268,10 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( } if (io_s.ok() && sync) { - io_s = dest_writer->Sync(false); + io_s = dest_writer->Sync(opts, false); } if (io_s.ok()) { - io_s = dest_writer->Close(); + io_s = dest_writer->Close(opts); } return io_s; } @@ -2329,11 +2322,20 @@ IOStatus BackupEngineImpl::AddBackupFileWorkItem( if (GetNamingNoFlags() != BackupEngineOptions::kLegacyCrc32cAndFileSize && file_type != kBlobFile) { // Prepare db_session_id to add to the file name - // Ignore the returned status - // In the failed cases, db_id and db_session_id will be empty - GetFileDbIdentities(db_env_, src_env_options, src_path, src_temperature, - rate_limiter, &db_id, &db_session_id) - .PermitUncheckedError(); + Status s = GetFileDbIdentities(db_env_, src_env_options, src_path, + src_temperature, rate_limiter, &db_id, + &db_session_id); + if (s.IsPathNotFound()) { + // Retry with any temperature + s = GetFileDbIdentities(db_env_, src_env_options, src_path, + Temperature::kUnknown, rate_limiter, &db_id, + &db_session_id); + } + if (s.IsNotFound()) { + // db_id and db_session_id will be empty, which is OK for old files + } else if (!s.ok()) { + return status_to_io_status(std::move(s)); + } } // Calculate checksum if checksum and db session id are not available. // If db session id is available, we will not calculate the checksum @@ -2591,7 +2593,7 @@ Status BackupEngineImpl::GetFileDbIdentities( SstFileDumper sst_reader(options, file_path, file_temp, 2 * 1024 * 1024 /* readahead_size */, - false /* verify_checksum */, false /* output_hex */, + true /* verify_checksum */, false /* output_hex */, false /* decode_blob_index */, src_env_options, true /* silent */); @@ -2602,6 +2604,7 @@ Status BackupEngineImpl::GetFileDbIdentities( if (s.ok()) { // Try to get table properties from the table reader of sst_reader if (!sst_reader.ReadTableProperties(&tp).ok()) { + // FIXME (peterd): this logic is untested and seems obsolete. // Try to use table properites from the initialization of sst_reader table_properties = sst_reader.GetInitTableProperties(); } else { @@ -3352,4 +3355,3 @@ void TEST_SetDefaultRateLimitersClock( restore_rate_limiter_clock); } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index 5ed6ae89513..917effceab8 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -86,7 +86,7 @@ class DummyDB : public StackableDB { DBOptions GetDBOptions() const override { return DBOptions(options_); } - Status EnableFileDeletions(bool /*force*/) override { + Status EnableFileDeletions() override { EXPECT_TRUE(!deletions_enabled_); deletions_enabled_ = true; return Status::OK(); @@ -181,6 +181,40 @@ class TestFs : public FileSystemWrapper { bool fail_reads_; }; + class CheckIOOptsSequentialFile : public FSSequentialFileOwnerWrapper { + public: + CheckIOOptsSequentialFile(std::unique_ptr&& f, + const std::string& file_name) + : FSSequentialFileOwnerWrapper(std::move(f)) { + is_sst_file_ = file_name.find(".sst") != std::string::npos; + } + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + // Backup currently associates only SST read with rate limiter priority + assert(!is_sst_file_ || options.rate_limiter_priority == + kExpectedBackupReadRateLimiterPri); + IOStatus rv = target()->Read(n, options, result, scratch, dbg); + return rv; + } + + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override { + // Backup currently associates only SST read with rate limiter priority + assert(!is_sst_file_ || options.rate_limiter_priority == + kExpectedBackupReadRateLimiterPri); + IOStatus rv = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + return rv; + } + + private: + static const Env::IOPriority kExpectedBackupReadRateLimiterPri = + Env::IO_LOW; + bool is_sst_file_; + }; + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, std::unique_ptr* r, IODebugContext* dbg) override { @@ -189,6 +223,14 @@ class TestFs : public FileSystemWrapper { r->reset( new TestFs::DummySequentialFile(dummy_sequential_file_fail_reads_)); return IOStatus::OK(); + } else if (check_iooptions_sequential_file_) { + std::unique_ptr file; + IOStatus s = + FileSystemWrapper::NewSequentialFile(f, file_opts, &file, dbg); + if (s.ok()) { + r->reset(new TestFs::CheckIOOptsSequentialFile(std::move(file), f)); + } + return s; } else { IOStatus s = FileSystemWrapper::NewSequentialFile(f, file_opts, r, dbg); if (s.ok()) { @@ -292,6 +334,11 @@ class TestFs : public FileSystemWrapper { dummy_sequential_file_fail_reads_ = dummy_sequential_file_fail_reads; } + void SetCheckIOOptionsSequentialFile(bool check_iooptions_sequential_file) { + MutexLock l(&mutex_); + check_iooptions_sequential_file_ = check_iooptions_sequential_file; + } + void SetGetChildrenFailure(bool fail) { get_children_failure_ = fail; } IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, std::vector* r, @@ -387,6 +434,7 @@ class TestFs : public FileSystemWrapper { port::Mutex mutex_; bool dummy_sequential_file_ = false; bool dummy_sequential_file_fail_reads_ = false; + bool check_iooptions_sequential_file_ = false; std::vector written_files_; std::vector filenames_for_mocked_attrs_; uint64_t limit_written_files_ = 1000000; @@ -499,6 +547,24 @@ class FileManager : public EnvWrapper { return WriteToFile(fname, file_contents); } + Status CorruptFileMiddle(const std::string& fname) { + std::string to_xor = "blah"; + std::string file_contents; + Status s = ReadFileToString(this, fname, &file_contents); + if (!s.ok()) { + return s; + } + s = DeleteFile(fname); + if (!s.ok()) { + return s; + } + size_t middle = file_contents.size() / 2; + for (size_t i = 0; i < to_xor.size(); ++i) { + file_contents[middle + i] ^= to_xor[i]; + } + return WriteToFile(fname, file_contents); + } + Status CorruptChecksum(const std::string& fname, bool appear_valid) { std::string metadata; Status s = ReadFileToString(this, fname, &metadata); @@ -792,8 +858,8 @@ class BackupEngineTest : public testing::Test { for (auto& dir : child_dirs) { dir = "private/" + dir; } - child_dirs.push_back("shared"); // might not exist - child_dirs.push_back("shared_checksum"); // might not exist + child_dirs.emplace_back("shared"); // might not exist + child_dirs.emplace_back("shared_checksum"); // might not exist for (auto& dir : child_dirs) { std::vector children; test_backup_env_->GetChildren(backupdir_ + "/" + dir, &children) @@ -861,7 +927,7 @@ class BackupEngineTest : public testing::Test { void DeleteLogFiles() { std::vector delete_logs; ASSERT_OK(db_chroot_env_->GetChildren(dbname_, &delete_logs)); - for (auto f : delete_logs) { + for (const auto& f : delete_logs) { uint64_t number; FileType type; bool ok = ParseFileName(f, &number, &type); @@ -931,7 +997,7 @@ class BackupEngineTest : public testing::Test { } file_contents[0] = (file_contents[0] + 257) % 256; - return WriteStringToFile(test_db_env_.get(), file_contents, fname); + return WriteStringToFile(test_db_env_.get(), file_contents, fname, false); } void AssertDirectoryFilesMatchRegex(const std::string& dir, @@ -1166,7 +1232,8 @@ TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) { // restore) // options_.db_paths.emplace_back(dbname_, 500 * 1024); // options_.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024); - + test_db_fs_->SetCheckIOOptionsSequentialFile(true); + test_backup_fs_->SetCheckIOOptionsSequentialFile(true); OpenDBAndBackupEngine(true); // write some data, backup, repeat for (int i = 0; i < 5; ++i) { @@ -1223,6 +1290,8 @@ TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) { AssertBackupConsistency(0, 0, 3 * keys_iteration, max_key); CloseBackupEngine(); + test_db_fs_->SetCheckIOOptionsSequentialFile(false); + test_backup_fs_->SetCheckIOOptionsSequentialFile(false); } #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) @@ -1692,7 +1761,7 @@ TEST_F(BackupEngineTest, TableFileWithoutDbChecksumCorruptedDuringBackup) { "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup", [&](void* data) { if (data != nullptr) { - Slice* d = reinterpret_cast(data); + Slice* d = static_cast(data); if (!d->empty()) { d->remove_suffix(1); corrupted = true; @@ -1734,7 +1803,7 @@ TEST_F(BackupEngineTest, TableFileWithDbChecksumCorruptedDuringBackup) { "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup", [&](void* data) { if (data != nullptr) { - Slice* d = reinterpret_cast(data); + Slice* d = static_cast(data); if (!d->empty()) { d->remove_suffix(1); } @@ -1856,7 +1925,7 @@ TEST_F(BackupEngineTest, BackupOptions) { ASSERT_OK(file_manager_->FileExists(OptionsPath(backupdir_, i) + name)); ASSERT_OK(backup_chroot_env_->GetChildren(OptionsPath(backupdir_, i), &filenames)); - for (auto fn : filenames) { + for (const auto& fn : filenames) { if (fn.compare(0, 7, "OPTIONS") == 0) { ASSERT_EQ(name, fn); } @@ -2235,6 +2304,33 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) { } } +TEST_F(BackupEngineTest, PropertiesBlockCorruptionIncremental) { + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + DBImpl* dbi = static_cast(db_.get()); + // A small SST file + ASSERT_OK(dbi->Put(WriteOptions(), "x", "y")); + ASSERT_OK(dbi->Flush(FlushOptions())); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + CloseBackupEngine(); + + std::vector table_files; + ASSERT_OK(GetDataFilesInDB(kTableFile, &table_files)); + ASSERT_EQ(table_files.size(), 1); + std::string tf = dbname_ + "/" + table_files[0].name; + // Properties block should be in the middle of a small file + ASSERT_OK(db_file_manager_->CorruptFileMiddle(tf)); + + OpenBackupEngine(); + + Status s = backup_engine_->CreateNewBackup(db_.get()); + ASSERT_TRUE(s.IsCorruption()); + + CloseDBAndBackupEngine(); +} + // Test how naming options interact with detecting file size corruption // between incremental backups TEST_F(BackupEngineTest, FileSizeForIncremental) { @@ -2568,7 +2664,7 @@ TEST_F(BackupEngineTest, DeleteTmpFiles) { assert(false); } CloseDBAndBackupEngine(); - for (std::string file_or_dir : tmp_files_and_dirs) { + for (const std::string& file_or_dir : tmp_files_and_dirs) { if (file_manager_->FileExists(file_or_dir) != Status::NotFound()) { FAIL() << file_or_dir << " was expected to be deleted." << cleanup_fn; } @@ -2602,7 +2698,7 @@ class BackupEngineRateLimitingTestWithParam int /* 0 = single threaded, 1 = multi threaded*/, std::pair /* limits */>> { public: - BackupEngineRateLimitingTestWithParam() {} + BackupEngineRateLimitingTestWithParam() = default; }; uint64_t const MB = 1024 * 1024; @@ -2638,12 +2734,14 @@ TEST_P(BackupEngineRateLimitingTestWithParam, RateLimiting) { std::make_shared( limit.first, 100 * 1000 /* refill_period_us */, 10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */, - special_env->GetSystemClock(), false /* auto_tuned */); + special_env->GetSystemClock(), false /* auto_tuned */, + 0 /* single_burst_bytes */); std::shared_ptr restore_rate_limiter = std::make_shared( limit.second, 100 * 1000 /* refill_period_us */, 10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */, - special_env->GetSystemClock(), false /* auto_tuned */); + special_env->GetSystemClock(), false /* auto_tuned */, + 0 /* single_burst_bytes */); engine_options_->backup_rate_limiter = backup_rate_limiter; engine_options_->restore_rate_limiter = restore_rate_limiter; } else { @@ -2711,7 +2809,8 @@ TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingVerifyBackup) { std::make_shared( backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */, 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */, - special_env->GetSystemClock(), false /* auto_tuned */); + special_env->GetSystemClock(), false /* auto_tuned */, + 0 /* single_burst_bytes */); engine_options_->backup_rate_limiter = backup_rate_limiter; } else { engine_options_->backup_rate_limit = backup_rate_limiter_limit; @@ -2749,7 +2848,7 @@ TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingVerifyBackup) { true /* include_file_details */)); std::uint64_t bytes_read_during_verify_backup = 0; - for (BackupFileInfo backup_file_info : backup_info.file_details) { + for (const BackupFileInfo& backup_file_info : backup_info.file_details) { bytes_read_during_verify_backup += backup_file_info.size; } auto start_verify_backup = special_env->NowMicros(); @@ -2887,7 +2986,7 @@ class BackupEngineRateLimitingTestWithParam2 public testing::WithParamInterface< std::tuple /* limits */>> { public: - BackupEngineRateLimitingTestWithParam2() {} + BackupEngineRateLimitingTestWithParam2() = default; }; INSTANTIATE_TEST_CASE_P( @@ -2907,7 +3006,8 @@ TEST_P(BackupEngineRateLimitingTestWithParam2, std::make_shared( backup_rate_limiter_limit, 1000 * 1000 /* refill_period_us */, 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */, - special_env.GetSystemClock(), false /* auto_tuned */)); + special_env.GetSystemClock(), false /* auto_tuned */, + 0 /* single_burst_bytes */)); engine_options_->backup_rate_limiter = backup_rate_limiter; @@ -2916,7 +3016,8 @@ TEST_P(BackupEngineRateLimitingTestWithParam2, std::make_shared( restore_rate_limiter_limit, 1000 * 1000 /* refill_period_us */, 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */, - special_env.GetSystemClock(), false /* auto_tuned */)); + special_env.GetSystemClock(), false /* auto_tuned */, + 0 /* single_burst_bytes */)); engine_options_->restore_rate_limiter = restore_rate_limiter; @@ -3903,7 +4004,7 @@ TEST_F(BackupEngineTest, BackgroundThreadCpuPriority) { std::atomic priority(CpuPriority::kNormal); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackupEngineImpl::Initialize:SetCpuPriority", [&](void* new_priority) { - priority.store(*reinterpret_cast(new_priority)); + priority.store(*static_cast(new_priority)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -4083,7 +4184,7 @@ TEST_F(BackupEngineTest, FileTemperatures) { SetEnvsFromFileSystems(); // Use temperatures - options_.bottommost_temperature = Temperature::kWarm; + options_.last_level_temperature = Temperature::kWarm; options_.level0_file_num_compaction_trigger = 2; // set dynamic_level to true so the compaction would compact the data to the // last level directly which will have the last_level_temperature @@ -4111,7 +4212,7 @@ TEST_F(BackupEngineTest, FileTemperatures) { std::vector infos; ASSERT_OK( db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos)); - for (auto info : infos) { + for (const auto& info : infos) { if (info.file_type == kTableFile) { manifest_temps.emplace(info.file_number, info.temperature); manifest_temp_counts[info.temperature]++; @@ -4278,7 +4379,7 @@ TEST_F(BackupEngineTest, ExcludeFiles) { MaybeExcludeBackupFile* files_end) { for (auto* f = files_begin; f != files_end; ++f) { std::string s = StringSplit(f->info.relative_file, '/').back(); - s = s.substr(0, s.find("_")); + s = s.substr(0, s.find('_')); int64_t num = std::strtoll(s.c_str(), nullptr, /*base*/ 10); // Exclude if not a match f->exclude_decision = (num % modulus) != remainder; diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc index ddaa98c7d32..f22a1694579 100644 --- a/utilities/blob_db/blob_compaction_filter.cc +++ b/utilities/blob_db/blob_compaction_filter.cc @@ -13,8 +13,7 @@ #include "rocksdb/system_clock.h" #include "test_util/sync_point.h" -namespace ROCKSDB_NAMESPACE { -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { BlobIndexCompactionFilterBase::~BlobIndexCompactionFilterBase() { if (blob_file_) { @@ -181,7 +180,9 @@ bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const { BlobDBImpl* const blob_db_impl = context_.blob_db_impl; assert(blob_db_impl); + // TODO: plumb Env::IOActivity, Env::IOPriority const Status s = blob_db_impl->CreateBlobFileAndWriter( + WriteOptions(), /* has_ttl */ false, ExpirationRange(), "compaction/GC", &blob_file_, &writer_); if (!s.ok()) { @@ -251,8 +252,9 @@ bool BlobIndexCompactionFilterBase::WriteBlobToNewFile( assert(writer_); uint64_t new_key_offset = 0; - const Status s = writer_->AddRecord(key, blob, kNoExpiration, &new_key_offset, - new_blob_offset); + // TODO: plumb Env::IOActivity, Env::IOPriority + const Status s = writer_->AddRecord(WriteOptions(), key, blob, kNoExpiration, + &new_key_offset, new_blob_offset); if (!s.ok()) { const BlobDBImpl* const blob_db_impl = context_.blob_db_impl; @@ -302,7 +304,8 @@ bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const { { WriteLock wl(&blob_db_impl->mutex_); - s = blob_db_impl->CloseBlobFile(blob_file_); + // TODO: plumb Env::IOActivity, Env::IOPriority + s = blob_db_impl->CloseBlobFile(WriteOptions(), blob_file_); // Note: we delay registering the new blob file until it's closed to // prevent FIFO eviction from processing it during compaction/GC. @@ -484,5 +487,4 @@ BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter( std::move(user_comp_filter_from_factory), current_time, statistics())); } -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index b6fe0390364..25960bdd6c8 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -11,8 +11,7 @@ #include "logging/logging.h" #include "utilities/blob_db/blob_db_impl.h" -namespace ROCKSDB_NAMESPACE { -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options, const std::string& dbname, BlobDB** blob_db) { @@ -20,8 +19,7 @@ Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options, DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families, &handles, blob_db); @@ -108,5 +106,4 @@ void BlobDBOptions::Dump(Logger* log) const { disable_background_tasks); } -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index e2f0b7bdbdd..d7eaf0fae22 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -88,11 +88,10 @@ struct BlobDBOptions { class BlobDB : public StackableDB { public: using ROCKSDB_NAMESPACE::StackableDB::Put; - virtual Status Put(const WriteOptions& options, const Slice& key, - const Slice& value) override = 0; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) override = 0; + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) override { if (column_family->GetID() != DefaultColumnFamily()->GetID()) { return Status::NotSupported( "Blob DB doesn't support non-default column family."); @@ -101,9 +100,8 @@ class BlobDB : public StackableDB { } using ROCKSDB_NAMESPACE::StackableDB::Delete; - virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key) override { if (column_family->GetID() != DefaultColumnFamily()->GetID()) { return Status::NotSupported( "Blob DB doesn't support non-default column family."); @@ -139,9 +137,9 @@ class BlobDB : public StackableDB { } using ROCKSDB_NAMESPACE::StackableDB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override = 0; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + std::string* timestamp) override = 0; // Get value and expiration. virtual Status Get(const ReadOptions& options, @@ -152,57 +150,26 @@ class BlobDB : public StackableDB { return Get(options, DefaultColumnFamily(), key, value, expiration); } - using ROCKSDB_NAMESPACE::StackableDB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, const std::vector& keys, - std::vector* values) override = 0; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_families, - const std::vector& keys, - std::vector* values) override { - for (auto column_family : column_families) { - if (column_family->GetID() != DefaultColumnFamily()->GetID()) { - return std::vector( - column_families.size(), - Status::NotSupported( - "Blob DB doesn't support non-default column family.")); - } - } - return MultiGet(options, keys, values); - } - virtual void MultiGet(const ReadOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const size_t num_keys, const Slice* /*keys*/, - PinnableSlice* /*values*/, Status* statuses, - const bool /*sorted_input*/ = false) override { - for (size_t i = 0; i < num_keys; ++i) { - statuses[i] = - Status::NotSupported("Blob DB doesn't support batched MultiGet"); - } - } - using ROCKSDB_NAMESPACE::StackableDB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& /*wopts*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { + Status SingleDelete(const WriteOptions& /*wopts*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in blob db."); } using ROCKSDB_NAMESPACE::StackableDB::Merge; - virtual Status Merge(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { + Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in blob db."); } - virtual Status Write(const WriteOptions& opts, - WriteBatch* updates) override = 0; + Status Write(const WriteOptions& opts, WriteBatch* updates) override = 0; using ROCKSDB_NAMESPACE::StackableDB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& options) override = 0; - virtual Iterator* NewIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override { + Iterator* NewIterator(const ReadOptions& options) override = 0; + Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override { if (column_family->GetID() != DefaultColumnFamily()->GetID()) { // Blob DB doesn't support non-default column family. return nullptr; @@ -233,7 +200,7 @@ class BlobDB : public StackableDB { } using ROCKSDB_NAMESPACE::StackableDB::Close; - virtual Status Close() override = 0; + Status Close() override = 0; // Opening blob db. static Status Open(const Options& options, const BlobDBOptions& bdb_options, @@ -248,9 +215,9 @@ class BlobDB : public StackableDB { virtual BlobDBOptions GetBlobDBOptions() const = 0; - virtual Status SyncBlobFiles() = 0; + virtual Status SyncBlobFiles(const WriteOptions& write_options) = 0; - virtual ~BlobDB() {} + ~BlobDB() override {} protected: explicit BlobDB(); diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 2fa7ae898f5..c88c3e8a709 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -23,6 +23,7 @@ #include "logging/logging.h" #include "monitoring/instrumented_mutex.h" #include "monitoring/statistics_impl.h" +#include "monitoring/thread_status_util.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -47,8 +48,7 @@ namespace { int kBlockBasedTableVersionFormat = 2; } // end namespace -namespace ROCKSDB_NAMESPACE { -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { bool BlobFileComparator::operator()( const std::shared_ptr& lhs, @@ -106,6 +106,15 @@ BlobDBImpl::~BlobDBImpl() { } Status BlobDBImpl::Close() { + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + Status s = CloseImpl(); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + return s; +} + +Status BlobDBImpl::CloseImpl() { if (closed_) { return Status::OK(); } @@ -123,7 +132,8 @@ Status BlobDBImpl::Close() { return s; } - s = SyncBlobFiles(); + // TODO: plumb Env::IOActivity, Env::IOPriority + s = SyncBlobFiles(WriteOptions()); return s; } @@ -277,7 +287,7 @@ Status BlobDBImpl::Open(std::vector* handles) { return s; } - UpdateLiveSSTSize(); + UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kDBOpen)); // Start background jobs. if (!bdb_options_.disable_background_tasks) { @@ -743,7 +753,9 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { } std::unique_ptr fwriter; - fwriter.reset(new WritableFileWriter(std::move(wfile), fpath, file_options_)); + fwriter.reset(new WritableFileWriter( + std::move(wfile), fpath, file_options_, clock_, nullptr /* io_tracer */, + statistics_, Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS)); uint64_t boffset = bfile->GetFileSize(); if (debug_level_ >= 2 && boffset) { @@ -824,8 +836,9 @@ Status BlobDBImpl::CheckOrCreateWriterLocked( } Status BlobDBImpl::CreateBlobFileAndWriter( - bool has_ttl, const ExpirationRange& expiration_range, - const std::string& reason, std::shared_ptr* blob_file, + const WriteOptions& write_options, bool has_ttl, + const ExpirationRange& expiration_range, const std::string& reason, + std::shared_ptr* blob_file, std::shared_ptr* writer) { TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter"); assert(has_ttl == (expiration_range.first || expiration_range.second)); @@ -846,7 +859,7 @@ Status BlobDBImpl::CreateBlobFileAndWriter( assert(*writer); - s = (*writer)->WriteHeader((*blob_file)->header_); + s = (*writer)->WriteHeader(write_options, (*blob_file)->header_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to write header to new blob file: %s" @@ -861,7 +874,8 @@ Status BlobDBImpl::CreateBlobFileAndWriter( return s; } -Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { +Status BlobDBImpl::SelectBlobFile(const WriteOptions& write_options, + std::shared_ptr* blob_file) { assert(blob_file); { @@ -885,6 +899,7 @@ Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { std::shared_ptr writer; const Status s = CreateBlobFileAndWriter( + write_options, /* has_ttl */ false, ExpirationRange(), /* reason */ "SelectBlobFile", blob_file, &writer); if (!s.ok()) { @@ -897,7 +912,8 @@ Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { return s; } -Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, +Status BlobDBImpl::SelectBlobFileTTL(const WriteOptions& write_options, + uint64_t expiration, std::shared_ptr* blob_file) { assert(blob_file); assert(expiration != kNoExpiration); @@ -930,9 +946,9 @@ Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')'; std::shared_ptr writer; - const Status s = - CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range, - /* reason */ oss.str(), blob_file, &writer); + const Status s = CreateBlobFileAndWriter( + write_options, /* has_ttl */ true, expiration_range, + /* reason */ oss.str(), blob_file, &writer); if (!s.ok()) { return s; } @@ -1055,7 +1071,7 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, return s; } -Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, +Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options, const Slice& key, const Slice& value, uint64_t expiration, WriteBatch* batch) { write_mutex_.AssertHeld(); @@ -1087,30 +1103,30 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, // Check DB size limit before selecting blob file to // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be // done before calling SelectBlobFile(). - s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() + - value_compressed.size()); + s = CheckSizeAndEvictBlobFiles( + write_options, headerbuf.size() + key.size() + value_compressed.size()); if (!s.ok()) { return s; } std::shared_ptr blob_file; if (expiration != kNoExpiration) { - s = SelectBlobFileTTL(expiration, &blob_file); + s = SelectBlobFileTTL(write_options, expiration, &blob_file); } else { - s = SelectBlobFile(&blob_file); + s = SelectBlobFile(write_options, &blob_file); } if (s.ok()) { assert(blob_file != nullptr); assert(blob_file->GetCompressionType() == bdb_options_.compression); - s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration, - &index_entry); + s = AppendBlob(write_options, blob_file, headerbuf, key, value_compressed, + expiration, &index_entry); } if (s.ok()) { if (expiration != kNoExpiration) { WriteLock file_lock(&blob_file->mutex_); blob_file->ExtendExpirationRange(expiration); } - s = CloseBlobFileIfNeeded(blob_file); + s = CloseBlobFileIfNeeded(write_options, blob_file); } if (s.ok()) { s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, @@ -1249,7 +1265,7 @@ void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context, } } -void BlobDBImpl::UpdateLiveSSTSize() { +void BlobDBImpl::UpdateLiveSSTSize(const WriteOptions& write_options) { uint64_t live_sst_size = 0; bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); if (ok) { @@ -1265,7 +1281,7 @@ void BlobDBImpl::UpdateLiveSSTSize() { { // Trigger FIFO eviction if needed. MutexLock l(&write_mutex_); - Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/); + Status s = CheckSizeAndEvictBlobFiles(write_options, 0, true /*force*/); if (s.IsNoSpace()) { ROCKS_LOG_WARN(db_options_.info_log, "DB grow out-of-space after SST size updated. Current live" @@ -1276,7 +1292,8 @@ void BlobDBImpl::UpdateLiveSSTSize() { } } -Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, +Status BlobDBImpl::CheckSizeAndEvictBlobFiles(const WriteOptions& write_options, + uint64_t blob_size, bool force_evict) { write_mutex_.AssertHeld(); @@ -1316,7 +1333,7 @@ Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, } // FIFO eviction can evict open blob files. if (!blob_file->Immutable()) { - Status s = CloseBlobFile(blob_file); + Status s = CloseBlobFile(write_options, blob_file); if (!s.ok()) { return s; } @@ -1347,7 +1364,8 @@ Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, return Status::OK(); } -Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, +Status BlobDBImpl::AppendBlob(const WriteOptions& write_options, + const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, const Slice& value, uint64_t expiration, std::string* index_entry) { @@ -1363,8 +1381,8 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, } // write the blob to the blob log. - s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, - &blob_offset); + s = writer->EmitPhysicalRecord(write_options, headerbuf, key, value, + &key_offset, &blob_offset); } if (!s.ok()) { @@ -1390,27 +1408,43 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, return s; } -std::vector BlobDBImpl::MultiGet(const ReadOptions& _read_options, - const std::vector& keys, - std::vector* values) { +void BlobDBImpl::MultiGet(const ReadOptions& _read_options, size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool /*sorted_input*/) { StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS); RecordTick(statistics_, BLOB_DB_NUM_MULTIGET); // Get a snapshot to avoid blob file get deleted between we // fetch and index entry and reading from the file. - std::vector statuses; - std::size_t num_keys = keys.size(); - statuses.reserve(num_keys); - if (_read_options.io_activity != Env::IOActivity::kUnknown && - _read_options.io_activity != Env::IOActivity::kMultiGet) { - Status s = Status::InvalidArgument( - "Can only call MultiGet with `ReadOptions::io_activity` is " - "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + { + Status s; + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + } else if (timestamps) { + s = Status::NotSupported( + "MultiGet() returning timestamps not implemented."); + } + if (s.ok()) { + for (size_t i = 0; i < num_keys; ++i) { + if (column_families[i]->GetID() != DefaultColumnFamily()->GetID()) { + s = Status::NotSupported( + "Blob DB doesn't support non-default column family."); + break; + } + } + } - for (size_t i = 0; i < num_keys; ++i) { - statuses.push_back(s); + if (!s.ok()) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = s; + } + return; } - return statuses; } ReadOptions read_options(_read_options); @@ -1419,19 +1453,13 @@ std::vector BlobDBImpl::MultiGet(const ReadOptions& _read_options, } bool snapshot_created = SetSnapshotIfNeeded(&read_options); - values->clear(); - values->reserve(keys.size()); - PinnableSlice value; - for (size_t i = 0; i < keys.size(); i++) { - statuses.push_back( - GetImpl(read_options, DefaultColumnFamily(), keys[i], &value)); - values->push_back(value.ToString()); - value.Reset(); + for (size_t i = 0; i < num_keys; i++) { + PinnableSlice& value = values[i]; + statuses[i] = GetImpl(read_options, DefaultColumnFamily(), keys[i], &value); } if (snapshot_created) { db_->ReleaseSnapshot(read_options.snapshot); } - return statuses; } bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) { @@ -1572,8 +1600,8 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, } else { buf.reserve(static_cast(record_size)); s = reader->Read(IOOptions(), record_offset, - static_cast(record_size), &blob_record, &buf[0], - nullptr); + static_cast(record_size), &blob_record, + buf.data(), nullptr); } RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); } @@ -1635,13 +1663,18 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, Status BlobDBImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) { + PinnableSlice* value, std::string* timestamp) { if (_read_options.io_activity != Env::IOActivity::kUnknown && _read_options.io_activity != Env::IOActivity::kGet) { return Status::InvalidArgument( "Can only call Get with `ReadOptions::io_activity` is " "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); } + if (timestamp) { + return Status::NotSupported( + "Get() that returns timestamp is not implemented."); + } + ReadOptions read_options(_read_options); if (read_options.io_activity == Env::IOActivity::kUnknown) { read_options.io_activity = Env::IOActivity::kGet; @@ -1735,7 +1768,7 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { uint64_t now = EpochNow(); - for (auto blob_file_pair : blob_files_) { + for (const auto& blob_file_pair : blob_files_) { auto blob_file = blob_file_pair.second; std::ostringstream buf; @@ -1767,7 +1800,8 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { return std::make_pair(true, -1); } -Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { +Status BlobDBImpl::CloseBlobFile(const WriteOptions& write_options, + std::shared_ptr bfile) { TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile"); assert(bfile); assert(!bfile->Immutable()); @@ -1783,7 +1817,7 @@ Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { const SequenceNumber sequence = GetLatestSequenceNumber(); - const Status s = bfile->WriteFooterAndCloseLocked(sequence); + const Status s = bfile->WriteFooterAndCloseLocked(write_options, sequence); if (s.ok()) { total_blob_size_ += BlobLogFooter::kSize; @@ -1815,7 +1849,8 @@ Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { return s; } -Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { +Status BlobDBImpl::CloseBlobFileIfNeeded(const WriteOptions& write_options, + std::shared_ptr& bfile) { write_mutex_.AssertHeld(); // atomic read @@ -1831,7 +1866,7 @@ Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { return Status::OK(); } - return CloseBlobFile(bfile); + return CloseBlobFile(write_options, bfile); } void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr blob_file, @@ -1893,7 +1928,7 @@ std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { uint64_t now = EpochNow(); { ReadLock rl(&mutex_); - for (auto p : blob_files_) { + for (const auto& p : blob_files_) { auto& blob_file = p.second; ReadLock file_lock(&blob_file->mutex_); if (blob_file->HasTTL() && !blob_file->Obsolete() && @@ -1921,7 +1956,8 @@ std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { } if (!blob_file->Immutable()) { - CloseBlobFile(blob_file).PermitUncheckedError(); + // TODO: plumb Env::IOActivity, Env::IOPriority + CloseBlobFile(WriteOptions(), blob_file).PermitUncheckedError(); } assert(blob_file->Immutable()); @@ -1933,13 +1969,13 @@ std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { return std::make_pair(true, -1); } -Status BlobDBImpl::SyncBlobFiles() { +Status BlobDBImpl::SyncBlobFiles(const WriteOptions& write_options) { MutexLock l(&write_mutex_); std::vector> process_files; { ReadLock rl(&mutex_); - for (auto fitr : open_ttl_files_) { + for (const auto& fitr : open_ttl_files_) { process_files.push_back(fitr); } if (open_non_ttl_file_ != nullptr) { @@ -1949,7 +1985,7 @@ Status BlobDBImpl::SyncBlobFiles() { Status s; for (auto& blob_file : process_files) { - s = blob_file->Fsync(); + s = blob_file->Fsync(write_options); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync blob file %" PRIu64 ", status: %s", @@ -1968,7 +2004,9 @@ Status BlobDBImpl::SyncBlobFiles() { } std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { - if (aborted) return std::make_pair(false, -1); + if (aborted) { + return std::make_pair(false, -1); + } if (open_file_count_.load() < kOpenFilesTrigger) { return std::make_pair(true, -1); @@ -1979,7 +2017,9 @@ std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { ReadLock rl(&mutex_); for (auto const& ent : blob_files_) { auto bfile = ent.second; - if (bfile->last_access_.load() == -1) continue; + if (bfile->last_access_.load() == -1) { + continue; + } WriteLock lockbfile_w(&bfile->mutex_); CloseRandomAccessLocked(bfile); @@ -2062,7 +2102,7 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { // put files back into obsolete if for some reason, delete failed if (!tobsolete.empty()) { WriteLock wl(&mutex_); - for (auto bfile : tobsolete) { + for (const auto& bfile : tobsolete) { blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); obsolete_files_.push_front(bfile); } @@ -2090,9 +2130,9 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) { if (read_options.io_activity == Env::IOActivity::kUnknown) { read_options.io_activity = Env::IOActivity::kDBIterator; } - auto* cfd = - static_cast_with_check(DefaultColumnFamily()) - ->cfd(); + auto* cfh = + static_cast_with_check(DefaultColumnFamily()); + auto* cfd = cfh->cfd(); // Get a snapshot to avoid blob file get deleted between we // fetch and index entry and reading from the file. ManagedSnapshot* own_snapshot = nullptr; @@ -2103,7 +2143,7 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) { } SuperVersion* sv = cfd->GetReferencedSuperVersion(db_impl_); auto* iter = db_impl_->NewIteratorImpl( - read_options, cfd, sv, snapshot->GetSequenceNumber(), + read_options, cfh, sv, snapshot->GetSequenceNumber(), nullptr /*read_callback*/, true /*expose_blob_index*/); return new BlobDBIterator(own_snapshot, iter, this, clock_, statistics_); } @@ -2196,7 +2236,7 @@ Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { WriteLock lock(&mutex_); WriteLock file_lock(&bfile->mutex_); - return CloseBlobFile(bfile); + return CloseBlobFile(WriteOptions(), bfile); } void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, @@ -2226,5 +2266,4 @@ void BlobDBImpl::TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info) { #endif // !NDEBUG -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index d491108d3e6..6cbc0d594a5 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -105,17 +105,17 @@ class BlobDBImpl : public BlobDB { using BlobDB::Get; Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + PinnableSlice* value, std::string* timestamp) override; Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, uint64_t* expiration) override; using BlobDB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& read_options) override; + Iterator* NewIterator(const ReadOptions& read_options) override; using BlobDB::NewIterators; - virtual Status NewIterators( + Status NewIterators( const ReadOptions& /*read_options*/, const std::vector& /*column_families*/, std::vector* /*iterators*/) override { @@ -123,14 +123,15 @@ class BlobDBImpl : public BlobDB { } using BlobDB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& _read_options, const std::vector& keys, - std::vector* values) override; + void MultiGet(const ReadOptions& _read_options, size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) override; using BlobDB::Write; - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; - virtual Status Close() override; + Status Close() override; using BlobDB::PutWithTTL; Status PutWithTTL(const WriteOptions& options, const Slice& key, @@ -154,20 +155,19 @@ class BlobDBImpl : public BlobDB { const DBOptions& db_options, const ColumnFamilyOptions& cf_options); - virtual Status DisableFileDeletions() override; + Status DisableFileDeletions() override; - virtual Status EnableFileDeletions(bool force) override; + Status EnableFileDeletions() override; - virtual Status GetLiveFiles(std::vector&, - uint64_t* manifest_file_size, - bool flush_memtable = true) override; - virtual void GetLiveFilesMetaData(std::vector*) override; + Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, + bool flush_memtable = true) override; + void GetLiveFilesMetaData(std::vector*) override; ~BlobDBImpl(); Status Open(std::vector* handles); - Status SyncBlobFiles() override; + Status SyncBlobFiles(const WriteOptions& write_options) override; // Common part of the two GetCompactionContext methods below. // REQUIRES: read lock on mutex_ @@ -245,11 +245,13 @@ class BlobDBImpl : public BlobDB { // to a single thread (like in the case of new files written during // compaction/GC), the locks on write_mutex_ and the blob file's mutex_ can be // avoided. - Status CloseBlobFile(std::shared_ptr bfile); + Status CloseBlobFile(const WriteOptions& write_options, + std::shared_ptr bfile); // Close a file if its size exceeds blob_file_size // REQUIRES: lock held on write_mutex_. - Status CloseBlobFileIfNeeded(std::shared_ptr& bfile); + Status CloseBlobFileIfNeeded(const WriteOptions& write_options, + std::shared_ptr& bfile); // Mark file as obsolete and move the file to obsolete file list. // @@ -261,13 +263,15 @@ class BlobDBImpl : public BlobDB { const Slice& value, uint64_t expiration, WriteBatch* batch); - Status AppendBlob(const std::shared_ptr& bfile, + Status AppendBlob(const WriteOptions& write_options, + const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, const Slice& value, uint64_t expiration, std::string* index_entry); // Create a new blob file and associated writer. - Status CreateBlobFileAndWriter(bool has_ttl, + Status CreateBlobFileAndWriter(const WriteOptions& write_options, + bool has_ttl, const ExpirationRange& expiration_range, const std::string& reason, std::shared_ptr* blob_file, @@ -275,11 +279,13 @@ class BlobDBImpl : public BlobDB { // Get the open non-TTL blob log file, or create a new one if no such file // exists. - Status SelectBlobFile(std::shared_ptr* blob_file); + Status SelectBlobFile(const WriteOptions& write_options, + std::shared_ptr* blob_file); // Get the open TTL blob log file for a certain expiration, or create a new // one if no such file exists. - Status SelectBlobFileTTL(uint64_t expiration, + Status SelectBlobFileTTL(const WriteOptions& write_options, + uint64_t expiration, std::shared_ptr* blob_file); std::shared_ptr FindBlobFileLocked(uint64_t expiration) const; @@ -363,7 +369,7 @@ class BlobDBImpl : public BlobDB { void MarkUnreferencedBlobFilesObsolete(); void MarkUnreferencedBlobFilesObsoleteDuringOpen(); - void UpdateLiveSSTSize(); + void UpdateLiveSSTSize(const WriteOptions& write_options); Status GetBlobFileReader(const std::shared_ptr& blob_file, std::shared_ptr* reader); @@ -394,9 +400,12 @@ class BlobDBImpl : public BlobDB { // If is_fifo = true, FIFO eviction will be triggered to make room for the // new blob. If force_evict = true, FIFO eviction will evict blob files // even eviction will not make enough room for the new blob. - Status CheckSizeAndEvictBlobFiles(uint64_t blob_size, + Status CheckSizeAndEvictBlobFiles(const WriteOptions& write_options, + uint64_t blob_size, bool force_evict = false); + Status CloseImpl(); + // name of the database directory std::string dbname_; diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc index da2d02d07a4..7f9189b12eb 100644 --- a/utilities/blob_db/blob_db_impl_filesnapshot.cc +++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -12,8 +12,7 @@ // BlobDBImpl methods to get snapshot of files, e.g. for replication. -namespace ROCKSDB_NAMESPACE { -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { Status BlobDBImpl::DisableFileDeletions() { // Disable base DB file deletions. @@ -35,9 +34,9 @@ Status BlobDBImpl::DisableFileDeletions() { return Status::OK(); } -Status BlobDBImpl::EnableFileDeletions(bool force) { +Status BlobDBImpl::EnableFileDeletions() { // Enable base DB file deletions. - Status s = db_impl_->EnableFileDeletions(force); + Status s = db_impl_->EnableFileDeletions(); if (!s.ok()) { return s; } @@ -45,9 +44,7 @@ Status BlobDBImpl::EnableFileDeletions(bool force) { int count = 0; { MutexLock l(&delete_file_mutex_); - if (force) { - disable_file_deletions_ = 0; - } else if (disable_file_deletions_ > 0) { + if (disable_file_deletions_ > 0) { count = --disable_file_deletions_; } assert(count >= 0); @@ -74,7 +71,7 @@ Status BlobDBImpl::GetLiveFiles(std::vector& ret, return s; } ret.reserve(ret.size() + blob_files_.size()); - for (auto bfile_pair : blob_files_) { + for (const auto& bfile_pair : blob_files_) { auto blob_file = bfile_pair.second; // Path should be relative to db_name, but begin with slash. ret.emplace_back( @@ -89,7 +86,7 @@ void BlobDBImpl::GetLiveFilesMetaData(std::vector* metadata) { // Hold a lock in the beginning to avoid updates to base DB during the call ReadLock rl(&mutex_); db_->GetLiveFilesMetaData(metadata); - for (auto bfile_pair : blob_files_) { + for (const auto& bfile_pair : blob_files_) { auto blob_file = bfile_pair.second; LiveFileMetaData filemetadata; filemetadata.size = blob_file->GetFileSize(); @@ -107,5 +104,4 @@ void BlobDBImpl::GetLiveFilesMetaData(std::vector* metadata) { } } -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db diff --git a/utilities/blob_db/blob_db_listener.h b/utilities/blob_db/blob_db_listener.h index c95740c50e7..0759b68114d 100644 --- a/utilities/blob_db/blob_db_listener.h +++ b/utilities/blob_db/blob_db_listener.h @@ -22,18 +22,20 @@ class BlobDBListener : public EventListener { void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->SyncBlobFiles().PermitUncheckedError(); + blob_db_impl_->SyncBlobFiles(WriteOptions(Env::IOActivity::kFlush)) + .PermitUncheckedError(); } void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->UpdateLiveSSTSize(); + blob_db_impl_->UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kFlush)); } void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->UpdateLiveSSTSize(); + blob_db_impl_->UpdateLiveSSTSize( + WriteOptions(Env::IOActivity::kCompaction)); } const char* Name() const override { return kClassName(); } diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 57c0411caae..4084450c2c0 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -31,8 +31,7 @@ #include "utilities/blob_db/blob_db_impl.h" #include "utilities/fault_injection_env.h" -namespace ROCKSDB_NAMESPACE { -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { class BlobDBTest : public testing::Test { public: @@ -118,9 +117,7 @@ class BlobDBTest : public testing::Test { } } - BlobDBImpl *blob_db_impl() { - return reinterpret_cast(blob_db_); - } + BlobDBImpl *blob_db_impl() { return static_cast(blob_db_); } Status Put(const Slice &key, const Slice &value, std::map *data = nullptr) { @@ -605,7 +602,7 @@ TEST_F(BlobDBTest, EnableDisableCompressionGC) { VerifyDB(data); blob_files = blob_db_impl()->TEST_GetBlobFiles(); - for (auto bfile : blob_files) { + for (const auto &bfile : blob_files) { ASSERT_EQ(kNoCompression, bfile->GetCompressionType()); } @@ -625,7 +622,7 @@ TEST_F(BlobDBTest, EnableDisableCompressionGC) { VerifyDB(data); blob_files = blob_db_impl()->TEST_GetBlobFiles(); - for (auto bfile : blob_files) { + for (const auto &bfile : blob_files) { ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType()); } } @@ -672,7 +669,7 @@ TEST_F(BlobDBTest, ChangeCompressionGC) { blob_db_impl()->TEST_DeleteObsoleteFiles(); blob_files = blob_db_impl()->TEST_GetBlobFiles(); - for (auto bfile : blob_files) { + for (const auto &bfile : blob_files) { ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType()); } @@ -689,7 +686,7 @@ TEST_F(BlobDBTest, ChangeCompressionGC) { blob_db_impl()->TEST_DeleteObsoleteFiles(); blob_files = blob_db_impl()->TEST_GetBlobFiles(); - for (auto bfile : blob_files) { + for (const auto &bfile : blob_files) { ASSERT_EQ(kNoCompression, bfile->GetCompressionType()); } @@ -713,7 +710,7 @@ TEST_F(BlobDBTest, ChangeCompressionGC) { blob_db_impl()->TEST_DeleteObsoleteFiles(); blob_files = blob_db_impl()->TEST_GetBlobFiles(); - for (auto bfile : blob_files) { + for (const auto &bfile : blob_files) { ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType()); } } @@ -725,8 +722,8 @@ TEST_F(BlobDBTest, MultipleWriters) { std::vector workers; std::vector> data_set(10); - for (uint32_t i = 0; i < 10; i++) - workers.push_back(port::Thread( + for (uint32_t i = 0; i < 10; i++) { + workers.emplace_back( [&](uint32_t id) { Random rnd(301 + id); for (int j = 0; j < 100; j++) { @@ -741,7 +738,8 @@ TEST_F(BlobDBTest, MultipleWriters) { } } }, - i)); + i); + } std::map data; for (size_t i = 0; i < 10; i++) { workers[i].join(); @@ -1369,8 +1367,8 @@ TEST_F(BlobDBTest, UserCompactionFilter) { constexpr uint64_t kMinValueSize = 1 << 6; constexpr uint64_t kMaxValueSize = 1 << 8; constexpr uint64_t kMinBlobSize = 1 << 7; - static_assert(kMinValueSize < kMinBlobSize, ""); - static_assert(kMaxValueSize > kMinBlobSize, ""); + static_assert(kMinValueSize < kMinBlobSize); + static_assert(kMaxValueSize > kMinBlobSize); BlobDBOptions bdb_options; bdb_options.min_blob_size = kMinBlobSize; @@ -1741,8 +1739,8 @@ TEST_F(BlobDBTest, GarbageCollection) { constexpr uint64_t kSmallValueSize = 1 << 6; constexpr uint64_t kLargeValueSize = 1 << 8; constexpr uint64_t kMinBlobSize = 1 << 7; - static_assert(kSmallValueSize < kMinBlobSize, ""); - static_assert(kLargeValueSize > kMinBlobSize, ""); + static_assert(kSmallValueSize < kMinBlobSize); + static_assert(kLargeValueSize > kMinBlobSize); constexpr size_t kBlobsPerFile = 8; constexpr size_t kNumBlobFiles = kNumPuts / kBlobsPerFile; @@ -1993,7 +1991,7 @@ TEST_F(BlobDBTest, EvictExpiredFile) { ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size()); // Make sure we don't return garbage value after blob file being evicted, // but the blob index still exists in the LSM tree. - std::string val = ""; + std::string val; ASSERT_TRUE(blob_db_->Get(ReadOptions(), "foo", &val).IsNotFound()); ASSERT_EQ("", val); } @@ -2003,40 +2001,36 @@ TEST_F(BlobDBTest, DisableFileDeletions) { bdb_options.disable_background_tasks = true; Open(bdb_options); std::map data; - for (bool force : {true, false}) { - ASSERT_OK(Put("foo", "v", &data)); - auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); - ASSERT_EQ(1, blob_files.size()); - auto blob_file = blob_files[0]; - ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file)); - blob_db_impl()->TEST_ObsoleteBlobFile(blob_file); - ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); - ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); - // Call DisableFileDeletions twice. - ASSERT_OK(blob_db_->DisableFileDeletions()); - ASSERT_OK(blob_db_->DisableFileDeletions()); - // File deletions should be disabled. - blob_db_impl()->TEST_DeleteObsoleteFiles(); - ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); - ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); - VerifyDB(data); - // Enable file deletions once. If force=true, file deletion is enabled. - // Otherwise it needs to enable it for a second time. - ASSERT_OK(blob_db_->EnableFileDeletions(force)); - blob_db_impl()->TEST_DeleteObsoleteFiles(); - if (!force) { - ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); - ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); - VerifyDB(data); - // Call EnableFileDeletions a second time. - ASSERT_OK(blob_db_->EnableFileDeletions(/*force=*/false)); - blob_db_impl()->TEST_DeleteObsoleteFiles(); - } - // Regardless of value of `force`, file should be deleted by now. - ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); - ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size()); - VerifyDB({}); - } + ASSERT_OK(Put("foo", "v", &data)); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + auto blob_file = blob_files[0]; + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file)); + blob_db_impl()->TEST_ObsoleteBlobFile(blob_file); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + // Call DisableFileDeletions twice. + ASSERT_OK(blob_db_->DisableFileDeletions()); + ASSERT_OK(blob_db_->DisableFileDeletions()); + // File deletions should be disabled. + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB(data); + // Enable file deletions once. File deletion will later get enabled when + // `EnableFileDeletions` called for a second time. + ASSERT_OK(blob_db_->EnableFileDeletions()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB(data); + // Call EnableFileDeletions a second time. + ASSERT_OK(blob_db_->EnableFileDeletions()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + // File should be deleted by now. + ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB({}); } TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) { @@ -2411,8 +2405,7 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) { ASSERT_TRUE(s.IsIOError()); } -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db // A black-box test for the ttl wrapper around rocksdb int main(int argc, char **argv) { diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index 0c2fef5e156..933803f8f30 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -5,9 +5,8 @@ #include "utilities/blob_db/blob_dump_tool.h" -#include - #include +#include #include #include #include @@ -21,8 +20,7 @@ #include "util/coding.h" #include "util/string_util.h" -namespace ROCKSDB_NAMESPACE { -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { BlobDumpTool::BlobDumpTool() : reader_(nullptr), buffer_(nullptr), buffer_size_(0) {} @@ -275,5 +273,4 @@ std::string BlobDumpTool::GetString(std::pair p) { return "(" + std::to_string(p.first) + ", " + std::to_string(p.second) + ")"; } -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 5b31d569732..5a479dc8bd4 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -5,10 +5,9 @@ // (found in the LICENSE.Apache file in the root directory). #include "utilities/blob_db/blob_file.h" -#include - #include #include +#include #include #include "db/column_family.h" @@ -19,9 +18,7 @@ #include "logging/logging.h" #include "utilities/blob_db/blob_db_impl.h" -namespace ROCKSDB_NAMESPACE { - -namespace blob_db { +namespace ROCKSDB_NAMESPACE::blob_db { BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn, Logger* info_log) @@ -78,7 +75,8 @@ void BlobFile::MarkObsolete(SequenceNumber sequence) { obsolete_.store(true); } -Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) { +Status BlobFile::WriteFooterAndCloseLocked(const WriteOptions& write_options, + SequenceNumber sequence) { BlobLogFooter footer; footer.blob_count = blob_count_; if (HasTTL()) { @@ -86,7 +84,8 @@ Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) { } // this will close the file and reset the Writable File Pointer. - Status s = log_writer_->AppendFooter(footer, /* checksum_method */ nullptr, + Status s = log_writer_->AppendFooter(write_options, footer, + /* checksum_method */ nullptr, /* checksum_value */ nullptr); if (s.ok()) { closed_ = true; @@ -118,9 +117,11 @@ Status BlobFile::ReadFooter(BlobLogFooter* bf) { } else { buf.reserve(BlobLogFooter::kSize + 10); s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize, - &result, &buf[0], nullptr); + &result, buf.data(), nullptr); + } + if (!s.ok()) { + return s; } - if (!s.ok()) return s; if (result.size() != BlobLogFooter::kSize) { // should not happen return Status::IOError("EOF reached before footer"); @@ -137,10 +138,10 @@ Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) { return Status::OK(); } -Status BlobFile::Fsync() { +Status BlobFile::Fsync(const WriteOptions& write_options) { Status s; if (log_writer_.get()) { - s = log_writer_->Sync(); + s = log_writer_->Sync(write_options); } return s; } @@ -240,7 +241,7 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, } else { header_buf.reserve(BlobLogHeader::kSize); s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, - &header_buf[0], nullptr); + header_buf.data(), nullptr); } if (!s.ok()) { ROCKS_LOG_ERROR( @@ -281,8 +282,8 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, } else { footer_buf.reserve(BlobLogFooter::kSize); s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, - BlobLogFooter::kSize, &footer_slice, &footer_buf[0], - nullptr); + BlobLogFooter::kSize, &footer_slice, + footer_buf.data(), nullptr); } if (!s.ok()) { ROCKS_LOG_ERROR( @@ -307,5 +308,4 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, return Status::OK(); } -} // namespace blob_db -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::blob_db diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h index 8651c6b6728..f0ec83ebe8a 100644 --- a/utilities/blob_db/blob_file.h +++ b/utilities/blob_db/blob_file.h @@ -180,7 +180,7 @@ class BlobFile { return obsolete_sequence_; } - Status Fsync(); + Status Fsync(const WriteOptions& write_options); uint64_t GetFileSize() const { return file_size_.load(std::memory_order_acquire); @@ -218,7 +218,8 @@ class BlobFile { private: Status ReadFooter(BlobLogFooter* footer); - Status WriteFooterAndCloseLocked(SequenceNumber sequence); + Status WriteFooterAndCloseLocked(const WriteOptions& write_options, + SequenceNumber sequence); void CloseRandomAccessLocked(); diff --git a/utilities/cache_dump_load_impl.h b/utilities/cache_dump_load_impl.h index 59cabbf3b68..4ec42ef633f 100644 --- a/utilities/cache_dump_load_impl.h +++ b/utilities/cache_dump_load_impl.h @@ -40,7 +40,7 @@ enum CacheDumpUnitType : unsigned char { kBlockTypeMax, }; -// The metadata of a dump unit. After it is serilized, its size is fixed 16 +// The metadata of a dump unit. After it is serialized, its size is fixed 16 // bytes. struct DumpUnitMeta { // sequence number is a monotonically increasing number to indicate the order @@ -48,7 +48,7 @@ struct DumpUnitMeta { uint32_t sequence_num; // The Crc32c checksum of its dump unit. uint32_t dump_unit_checksum; - // The dump unit size after the dump unit is serilized to a string. + // The dump unit size after the dump unit is serialized to a string. uint64_t dump_unit_size; void reset() { @@ -158,33 +158,35 @@ class ToFileCacheDumpWriter : public CacheDumpWriter { ~ToFileCacheDumpWriter() { Close().PermitUncheckedError(); } // Write the serialized metadata to the file - virtual IOStatus WriteMetadata(const Slice& metadata) override { + IOStatus WriteMetadata(const Slice& metadata) override { assert(file_writer_ != nullptr); std::string prefix; PutFixed32(&prefix, static_cast(metadata.size())); - IOStatus io_s = file_writer_->Append(Slice(prefix)); + const IOOptions opts; + IOStatus io_s = file_writer_->Append(opts, Slice(prefix)); if (!io_s.ok()) { return io_s; } - io_s = file_writer_->Append(metadata); + io_s = file_writer_->Append(opts, metadata); return io_s; } // Write the serialized data to the file - virtual IOStatus WritePacket(const Slice& data) override { + IOStatus WritePacket(const Slice& data) override { assert(file_writer_ != nullptr); std::string prefix; PutFixed32(&prefix, static_cast(data.size())); - IOStatus io_s = file_writer_->Append(Slice(prefix)); + const IOOptions opts; + IOStatus io_s = file_writer_->Append(opts, Slice(prefix)); if (!io_s.ok()) { return io_s; } - io_s = file_writer_->Append(data); + io_s = file_writer_->Append(opts, data); return io_s; } // Reset the writer - virtual IOStatus Close() override { + IOStatus Close() override { file_writer_.reset(); return IOStatus::OK(); } @@ -206,7 +208,7 @@ class FromFileCacheDumpReader : public CacheDumpReader { ~FromFileCacheDumpReader() { delete[] buffer_; } - virtual IOStatus ReadMetadata(std::string* metadata) override { + IOStatus ReadMetadata(std::string* metadata) override { uint32_t metadata_len = 0; IOStatus io_s = ReadSizePrefix(&metadata_len); if (!io_s.ok()) { @@ -215,7 +217,7 @@ class FromFileCacheDumpReader : public CacheDumpReader { return Read(metadata_len, metadata); } - virtual IOStatus ReadPacket(std::string* data) override { + IOStatus ReadPacket(std::string* data) override { uint32_t data_len = 0; IOStatus io_s = ReadSizePrefix(&data_len); if (!io_s.ok()) { diff --git a/utilities/cassandra/cassandra_compaction_filter.cc b/utilities/cassandra/cassandra_compaction_filter.cc index b7da2ba0cbc..21a81e19605 100644 --- a/utilities/cassandra/cassandra_compaction_filter.cc +++ b/utilities/cassandra/cassandra_compaction_filter.cc @@ -13,8 +13,7 @@ #include "utilities/cassandra/format.h" #include "utilities/cassandra/merge_operator.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { static std::unordered_map cassandra_filter_type_info = { {"purge_ttl_on_expiration", @@ -102,5 +101,4 @@ int RegisterCassandraObjects(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra diff --git a/utilities/cassandra/cassandra_compaction_filter.h b/utilities/cassandra/cassandra_compaction_filter.h index 0325a4c3957..31701e4b7ef 100644 --- a/utilities/cassandra/cassandra_compaction_filter.h +++ b/utilities/cassandra/cassandra_compaction_filter.h @@ -31,9 +31,9 @@ class CassandraCompactionFilter : public CompactionFilter { static const char* kClassName() { return "CassandraCompactionFilter"; } const char* Name() const override { return kClassName(); } - virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, - const Slice& existing_value, std::string* new_value, - std::string* skip_until) const override; + Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* skip_until) const override; private: CassandraOptions options_; diff --git a/utilities/cassandra/cassandra_format_test.cc b/utilities/cassandra/cassandra_format_test.cc index 4f12947ad9c..128dad4feae 100644 --- a/utilities/cassandra/cassandra_format_test.cc +++ b/utilities/cassandra/cassandra_format_test.cc @@ -11,8 +11,7 @@ #include "utilities/cassandra/serialize.h" #include "utilities/cassandra/test_utils.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { TEST(ColumnTest, Column) { char data[4] = {'d', 'a', 't', 'a'}; @@ -367,8 +366,7 @@ TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) { compacted.ConvertExpiredColumnsToTombstones(&changed); EXPECT_FALSE(changed); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index e3266a0dc11..28fba3acb88 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -18,8 +18,7 @@ #include "utilities/cassandra/test_utils.h" #include "utilities/merge_operators.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { // Path to the database on file system const std::string kDbName = test::PerThreadDBPath("cassandra_functional_test"); @@ -434,8 +433,7 @@ TEST_F(CassandraFunctionalTest, LoadCompactionFilterFactory) { ASSERT_TRUE(opts->purge_ttl_on_expiration); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/utilities/cassandra/cassandra_row_merge_test.cc b/utilities/cassandra/cassandra_row_merge_test.cc index 0b4a8928717..c54398458df 100644 --- a/utilities/cassandra/cassandra_row_merge_test.cc +++ b/utilities/cassandra/cassandra_row_merge_test.cc @@ -9,8 +9,7 @@ #include "utilities/cassandra/format.h" #include "utilities/cassandra/test_utils.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { class RowValueMergeTest : public testing::Test {}; @@ -88,8 +87,7 @@ TEST(RowValueMergeTest, MergeWithRowTombstone) { EXPECT_EQ(merged.LastModifiedTime(), 17); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/utilities/cassandra/cassandra_serialize_test.cc b/utilities/cassandra/cassandra_serialize_test.cc index c14d8fd809d..f05fd440854 100644 --- a/utilities/cassandra/cassandra_serialize_test.cc +++ b/utilities/cassandra/cassandra_serialize_test.cc @@ -6,8 +6,7 @@ #include "test_util/testharness.h" #include "utilities/cassandra/serialize.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { TEST(SerializeTest, SerializeI64) { std::string dest; @@ -154,8 +153,7 @@ TEST(SerializeTest, DeserializeI8) { EXPECT_EQ(-128, Deserialize(dest.c_str(), offset)); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/utilities/cassandra/format.cc b/utilities/cassandra/format.cc index cc1dd2f2803..dc2548bd949 100644 --- a/utilities/cassandra/format.cc +++ b/utilities/cassandra/format.cc @@ -11,8 +11,7 @@ #include "utilities/cassandra/serialize.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { namespace { const int32_t kDefaultLocalDeletionTime = std::numeric_limits::max(); const int64_t kDefaultMarkedForDeleteAt = std::numeric_limits::min(); @@ -363,5 +362,4 @@ RowValue RowValue::Merge(std::vector&& values) { return RowValue(std::move(columns), last_modified_time); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra diff --git a/utilities/cassandra/format.h b/utilities/cassandra/format.h index 1b27147351a..7d6f6e45be0 100644 --- a/utilities/cassandra/format.h +++ b/utilities/cassandra/format.h @@ -94,9 +94,9 @@ class Column : public ColumnBase { Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size, const char* value); - virtual int64_t Timestamp() const override; - virtual std::size_t Size() const override; - virtual void Serialize(std::string* dest) const override; + int64_t Timestamp() const override; + std::size_t Size() const override; + void Serialize(std::string* dest) const override; static std::shared_ptr Deserialize(const char* src, std::size_t offset); @@ -111,9 +111,9 @@ class Tombstone : public ColumnBase { Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time, int64_t marked_for_delete_at); - virtual int64_t Timestamp() const override; - virtual std::size_t Size() const override; - virtual void Serialize(std::string* dest) const override; + int64_t Timestamp() const override; + std::size_t Size() const override; + void Serialize(std::string* dest) const override; bool Collectable(int32_t gc_grace_period) const; static std::shared_ptr Deserialize(const char* src, std::size_t offset); @@ -128,8 +128,8 @@ class ExpiringColumn : public Column { ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size, const char* value, int32_t ttl); - virtual std::size_t Size() const override; - virtual void Serialize(std::string* dest) const override; + std::size_t Size() const override; + void Serialize(std::string* dest) const override; bool Expired() const; std::shared_ptr ToTombstone() const; diff --git a/utilities/cassandra/merge_operator.cc b/utilities/cassandra/merge_operator.cc index 366d8fa4435..5c9727f74f6 100644 --- a/utilities/cassandra/merge_operator.cc +++ b/utilities/cassandra/merge_operator.cc @@ -5,8 +5,7 @@ #include "merge_operator.h" -#include - +#include #include #include "rocksdb/merge_operator.h" @@ -15,8 +14,7 @@ #include "utilities/cassandra/format.h" #include "utilities/merge_operators.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { static std::unordered_map merge_operator_options_info = { {"gc_grace_period_in_seconds", @@ -75,6 +73,4 @@ bool CassandraValueMergeOperator::PartialMergeMulti( return true; } -} // namespace cassandra - -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra diff --git a/utilities/cassandra/merge_operator.h b/utilities/cassandra/merge_operator.h index af8725db7de..c6d738b957f 100644 --- a/utilities/cassandra/merge_operator.h +++ b/utilities/cassandra/merge_operator.h @@ -19,20 +19,19 @@ class CassandraValueMergeOperator : public MergeOperator { explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds, size_t operands_limit = 0); - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override; + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; - virtual bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override; + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const override; const char* Name() const override { return kClassName(); } static const char* kClassName() { return "CassandraValueMergeOperator"; } - virtual bool AllowSingleOperand() const override { return true; } + bool AllowSingleOperand() const override { return true; } - virtual bool ShouldMerge(const std::vector& operands) const override { + bool ShouldMerge(const std::vector& operands) const override { return options_.operands_limit > 0 && operands.size() >= options_.operands_limit; } diff --git a/utilities/cassandra/test_utils.cc b/utilities/cassandra/test_utils.cc index ec6e5752d0a..3615813500a 100644 --- a/utilities/cassandra/test_utils.cc +++ b/utilities/cassandra/test_utils.cc @@ -5,8 +5,7 @@ #include "test_utils.h" -namespace ROCKSDB_NAMESPACE { -namespace cassandra { +namespace ROCKSDB_NAMESPACE::cassandra { const char kData[] = {'d', 'a', 't', 'a'}; const char kExpiringData[] = {'e', 'd', 'a', 't', 'a'}; const int32_t kTtl = 86400; @@ -65,5 +64,4 @@ int64_t ToMicroSeconds(int64_t seconds) { return seconds * (int64_t)1000000; } int32_t ToSeconds(int64_t microseconds) { return (int32_t)(microseconds / (int64_t)1000000); } -} // namespace cassandra -} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE::cassandra diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index e1f09451309..f29b02f7741 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -135,8 +135,9 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, const Temperature temperature) { ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str()); return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname, - full_private_path + "/" + fname, size_limit_bytes, - db_options.use_fsync, nullptr, temperature); + temperature, full_private_path + "/" + fname, + temperature, size_limit_bytes, db_options.use_fsync, + nullptr); } /* copy_file_cb */, [&](const std::string& fname, const std::string& contents, FileType) { ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str()); @@ -148,7 +149,7 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, // we copied all the files, enable file deletions if (disabled_file_deletions) { - Status ss = db_->EnableFileDeletions(/*force=*/false); + Status ss = db_->EnableFileDeletions(); assert(ss.ok()); ss.PermitUncheckedError(); } @@ -332,12 +333,14 @@ Status CheckpointImpl::ExportColumnFamily( [&](const std::string& src_dirname, const std::string& fname) { ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s", cf_name.c_str(), fname.c_str()); + // FIXME: temperature handling return CopyFile(db_->GetFileSystem(), src_dirname + fname, - tmp_export_dir + fname, 0, db_options.use_fsync, - nullptr, Temperature::kUnknown); + Temperature::kUnknown, tmp_export_dir + fname, + Temperature::kUnknown, 0, db_options.use_fsync, + nullptr); } /*copy_file_cb*/); - const auto enable_status = db_->EnableFileDeletions(/*force=*/false); + const auto enable_status = db_->EnableFileDeletions(); if (s.ok()) { s = enable_status; } @@ -467,4 +470,3 @@ Status CheckpointImpl::ExportFilesInMetaData( return s; } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index a9cea1c058f..cdda1c05959 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -112,7 +112,7 @@ class CheckpointTest : public testing::Test { ColumnFamilyOptions cf_opts(options); size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); - for (auto cf : cfs) { + for (const auto& cf : cfs) { ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); } } @@ -141,7 +141,7 @@ class CheckpointTest : public testing::Test { EXPECT_EQ(cfs.size(), options.size()); std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + column_families.emplace_back(cfs[i], options[i]); } DBOptions db_opts = DBOptions(options[0]); return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); @@ -507,7 +507,7 @@ TEST_F(CheckpointTest, CheckpointCF) { cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"}; std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], options)); + column_families.emplace_back(cfs[i], options); } ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles, &snapshotDB)); @@ -565,7 +565,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) { cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"}; std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], options)); + column_families.emplace_back(cfs[i], options); } ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles, &snapshotDB)); @@ -717,12 +717,9 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) { TransactionDB* snapshotDB; std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); - column_families.push_back( - ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); - column_families.push_back( - ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + column_families.emplace_back("CFA", ColumnFamilyOptions()); + column_families.emplace_back("CFB", ColumnFamilyOptions()); std::vector cf_handles; ASSERT_OK(TransactionDB::Open(options, txn_db_options, snapshot_name_, column_families, &cf_handles, &snapshotDB)); diff --git a/utilities/debug.cc b/utilities/debug.cc index 911bc510a6a..6274cd6c2d0 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -38,7 +38,11 @@ static std::unordered_map value_type_string_map = { {"TypeCommitXIDAndTimestamp", ValueType::kTypeCommitXIDAndTimestamp}, {"TypeWideColumnEntity", ValueType::kTypeWideColumnEntity}, {"TypeColumnFamilyWideColumnEntity", - ValueType::kTypeColumnFamilyWideColumnEntity}}; + ValueType::kTypeColumnFamilyWideColumnEntity}, + {"TypeValuePreferredSeqno", ValueType::kTypeValuePreferredSeqno}, + {"TypeColumnFamilyValuePreferredSeqno", + ValueType::kTypeColumnFamilyValuePreferredSeqno}, +}; std::string KeyVersion::GetTypeName() const { std::string type_name; @@ -78,7 +82,7 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator); ReadOptions read_options; Arena arena; - ScopedArenaIterator iter( + ScopedArenaPtr iter( idb->NewInternalIterator(read_options, &arena, kMaxSequenceNumber, cfh)); if (!begin_key.empty()) { diff --git a/utilities/env_mirror.cc b/utilities/env_mirror.cc index 0802d7c708f..b8c74e5a71c 100644 --- a/utilities/env_mirror.cc +++ b/utilities/env_mirror.cc @@ -59,7 +59,7 @@ class SequentialFileMirror : public SequentialFile { Status bs = b_->InvalidateCache(offset, length); assert(as == bs); return as; - }; + } }; class RandomAccessFileMirror : public RandomAccessFile { @@ -214,10 +214,11 @@ Status EnvMirror::NewSequentialFile(const std::string& f, Status as = a_->NewSequentialFile(f, &mf->a_, options); Status bs = b_->NewSequentialFile(f, &mf->b_, options); assert(as == bs); - if (as.ok()) + if (as.ok()) { r->reset(mf); - else + } else { delete mf; + } return as; } @@ -231,25 +232,29 @@ Status EnvMirror::NewRandomAccessFile(const std::string& f, Status as = a_->NewRandomAccessFile(f, &mf->a_, options); Status bs = b_->NewRandomAccessFile(f, &mf->b_, options); assert(as == bs); - if (as.ok()) + if (as.ok()) { r->reset(mf); - else + } else { delete mf; + } return as; } Status EnvMirror::NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) { - if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options); + if (f.find("/proc/") == 0) { + return a_->NewWritableFile(f, r, options); + } WritableFileMirror* mf = new WritableFileMirror(f, options); Status as = a_->NewWritableFile(f, &mf->a_, options); Status bs = b_->NewWritableFile(f, &mf->b_, options); assert(as == bs); - if (as.ok()) + if (as.ok()) { r->reset(mf); - else + } else { delete mf; + } return as; } @@ -257,16 +262,18 @@ Status EnvMirror::ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, const EnvOptions& options) { - if (fname.find("/proc/") == 0) + if (fname.find("/proc/") == 0) { return a_->ReuseWritableFile(fname, old_fname, r, options); + } WritableFileMirror* mf = new WritableFileMirror(fname, options); Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options); Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options); assert(as == bs); - if (as.ok()) + if (as.ok()) { r->reset(mf); - else + } else { delete mf; + } return as; } diff --git a/utilities/fault_injection_env.cc b/utilities/fault_injection_env.cc index b0495a8c182..fb443cc87f3 100644 --- a/utilities/fault_injection_env.cc +++ b/utilities/fault_injection_env.cc @@ -71,7 +71,7 @@ Status Truncate(Env* env, const std::string& filename, uint64_t length) { // Trim the tailing "/" in the end of `str` std::string TrimDirname(const std::string& str) { - size_t found = str.find_last_not_of("/"); + size_t found = str.find_last_not_of('/'); if (found == std::string::npos) { return str; } @@ -528,7 +528,7 @@ Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { } for (auto& pair : map_copy) { - for (std::string name : pair.second) { + for (const std::string& name : pair.second) { Status s = DeleteFile(pair.first + "/" + name); if (!s.ok()) { return s; diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h index 6c1623a8d30..5612718c6c7 100644 --- a/utilities/fault_injection_env.h +++ b/utilities/fault_injection_env.h @@ -72,30 +72,25 @@ class TestWritableFile : public WritableFile { std::unique_ptr&& f, FaultInjectionTestEnv* env); virtual ~TestWritableFile(); - virtual Status Append(const Slice& data) override; - virtual Status Append( - const Slice& data, - const DataVerificationInfo& /*verification_info*/) override { + Status Append(const Slice& data) override; + Status Append(const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { return Append(data); } - virtual Status Truncate(uint64_t size) override { - return target_->Truncate(size); - } - virtual Status Close() override; - virtual Status Flush() override; - virtual Status Sync() override; - virtual bool IsSyncThreadSafe() const override { return true; } - virtual Status PositionedAppend(const Slice& data, uint64_t offset) override { + Status Truncate(uint64_t size) override { return target_->Truncate(size); } + Status Close() override; + Status Flush() override; + Status Sync() override; + bool IsSyncThreadSafe() const override { return true; } + Status PositionedAppend(const Slice& data, uint64_t offset) override { return target_->PositionedAppend(data, offset); } - virtual Status PositionedAppend( + Status PositionedAppend( const Slice& data, uint64_t offset, const DataVerificationInfo& /*verification_info*/) override { return PositionedAppend(data, offset); } - virtual bool use_direct_io() const override { - return target_->use_direct_io(); - }; + bool use_direct_io() const override { return target_->use_direct_io(); } uint64_t GetFileSize() final { return target_->GetFileSize(); } private: @@ -122,7 +117,7 @@ class TestRandomRWFile : public RandomRWFile { size_t GetRequiredBufferAlignment() const override { return target_->GetRequiredBufferAlignment(); } - bool use_direct_io() const override { return target_->use_direct_io(); }; + bool use_direct_io() const override { return target_->use_direct_io(); } private: std::unique_ptr target_; @@ -137,8 +132,8 @@ class TestDirectory : public Directory { : env_(env), dirname_(dirname), dir_(dir) {} ~TestDirectory() {} - virtual Status Fsync() override; - virtual Status Close() override; + Status Fsync() override; + Status Close() override; private: FaultInjectionTestEnv* env_; @@ -174,17 +169,15 @@ class FaultInjectionTestEnv : public EnvWrapper { std::unique_ptr* result, const EnvOptions& soptions) override; - virtual Status DeleteFile(const std::string& f) override; + Status DeleteFile(const std::string& f) override; - virtual Status RenameFile(const std::string& s, - const std::string& t) override; + Status RenameFile(const std::string& s, const std::string& t) override; - virtual Status LinkFile(const std::string& s, const std::string& t) override; + Status LinkFile(const std::string& s, const std::string& t) override; // Undef to eliminate clash on Windows #undef GetFreeSpace - virtual Status GetFreeSpace(const std::string& path, - uint64_t* disk_free) override { + Status GetFreeSpace(const std::string& path, uint64_t* disk_free) override { if (!IsFilesystemActive() && error_.subcode() == IOStatus::SubCode::kNoSpace) { *disk_free = 0; diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 53bbaeb0793..0ffb43ea60d 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -33,7 +33,7 @@ namespace ROCKSDB_NAMESPACE { -const std::string kNewFileNoOverwrite = ""; +const std::string kNewFileNoOverwrite; // Assume a filename, and not a directory name like "/foo/bar/" std::string TestFSGetDirName(const std::string filename) { @@ -47,7 +47,7 @@ std::string TestFSGetDirName(const std::string filename) { // Trim the tailing "/" in the end of `str` std::string TestFSTrimDirname(const std::string& str) { - size_t found = str.find_last_not_of("/"); + size_t found = str.find_last_not_of('/'); if (found == std::string::npos) { return str; } @@ -74,7 +74,6 @@ void CalculateTypedChecksum(const ChecksumType& checksum_type, const char* data, uint32_t v = XXH32(data, size, 0); PutFixed32(checksum, v); } - return; } IOStatus FSFileState::DropUnsyncedData() { @@ -196,9 +195,10 @@ IOStatus TestFSWritableFile::Append( data.size(), &checksum); if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum && checksum != verification_info.checksum.ToString()) { - std::string msg = "Data is corrupted! Origin data checksum: " + - verification_info.checksum.ToString() + - "current data checksum: " + checksum; + std::string msg = + "Data is corrupted! Origin data checksum: " + + verification_info.checksum.ToString(true) + + "current data checksum: " + Slice(checksum).ToString(true); return IOStatus::Corruption(msg); } if (target_->use_direct_io()) { @@ -229,9 +229,10 @@ IOStatus TestFSWritableFile::PositionedAppend( data.size(), &checksum); if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum && checksum != verification_info.checksum.ToString()) { - std::string msg = "Data is corrupted! Origin data checksum: " + - verification_info.checksum.ToString() + - "current data checksum: " + checksum; + std::string msg = + "Data is corrupted! Origin data checksum: " + + verification_info.checksum.ToString(true) + + "current data checksum: " + Slice(checksum).ToString(true); return IOStatus::Corruption(msg); } target_->PositionedAppend(data, offset, options, dbg); @@ -398,6 +399,7 @@ IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { + TEST_SYNC_POINT("FaultInjectionTestFS::RandomRead"); if (!fs_->IsFilesystemActive()) { return fs_->GetError(); } @@ -415,7 +417,7 @@ IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n, IOStatus TestFSRandomAccessFile::ReadAsync( FSReadRequest& req, const IOOptions& opts, - std::function cb, void* cb_arg, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { IOStatus ret; IOStatus s; @@ -457,8 +459,8 @@ IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, } bool this_injected_error; reqs[i].status = fs_->InjectThreadSpecificReadError( - FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq, - &(reqs[i].result), use_direct_io(), reqs[i].scratch, + FaultInjectionTestFS::ErrorOperation::kRead, &(reqs[i].result), + use_direct_io(), reqs[i].scratch, /*need_count_increase=*/true, /*fault_injected=*/&this_injected_error); injected_error |= this_injected_error; @@ -917,9 +919,10 @@ IOStatus FaultInjectionTestFS::DeleteFilesCreatedAfterLastDirSync( return io_s; } } else { + IOOptions opts; IOStatus io_s = WriteStringToFile(target(), file_pair.second, - pair.first + "/" + file_pair.first, true); + pair.first + "/" + file_pair.first, true, opts); if (!io_s.ok()) { return io_s; } @@ -1012,7 +1015,7 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name, uint64_t* number, FileType* type) { - std::size_t found = file_name.find_last_of("/"); + std::size_t found = file_name.find_last_of('/'); std::string file = file_name.substr(found); return ParseFileName(file, number, type); } diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index afd770dde07..356d21f5701 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -63,38 +63,39 @@ class TestFSWritableFile : public FSWritableFile { std::unique_ptr&& f, FaultInjectionTestFS* fs); virtual ~TestFSWritableFile(); - virtual IOStatus Append(const Slice& data, const IOOptions&, - IODebugContext*) override; - virtual IOStatus Append(const Slice& data, const IOOptions& options, - const DataVerificationInfo& verification_info, - IODebugContext* dbg) override; - virtual IOStatus Truncate(uint64_t size, const IOOptions& options, - IODebugContext* dbg) override { + IOStatus Append(const Slice& data, const IOOptions&, + IODebugContext*) override; + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { return target_->Truncate(size, options, dbg); } - virtual IOStatus Close(const IOOptions& options, - IODebugContext* dbg) override; - virtual IOStatus Flush(const IOOptions&, IODebugContext*) override; - virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/, - const IOOptions& options, - IODebugContext* dbg) override; - virtual bool IsSyncThreadSafe() const override { return true; } - virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& options, - IODebugContext* dbg) override { + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Flush(const IOOptions&, IODebugContext*) override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/, + const IOOptions& options, IODebugContext* dbg) override; + bool IsSyncThreadSafe() const override { return true; } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { return target_->PositionedAppend(data, offset, options, dbg); } IOStatus PositionedAppend(const Slice& data, uint64_t offset, const IOOptions& options, const DataVerificationInfo& verification_info, IODebugContext* dbg) override; - virtual size_t GetRequiredBufferAlignment() const override { + size_t GetRequiredBufferAlignment() const override { return target_->GetRequiredBufferAlignment(); } - virtual bool use_direct_io() const override { - return target_->use_direct_io(); - }; + bool use_direct_io() const override { return target_->use_direct_io(); } + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override { + MutexLock l(&mutex_); + return target_->GetFileSize(options, dbg); + } private: FSFileState state_; // Need protection by mutex_ @@ -124,7 +125,7 @@ class TestFSRandomRWFile : public FSRandomRWFile { size_t GetRequiredBufferAlignment() const override { return target_->GetRequiredBufferAlignment(); } - bool use_direct_io() const override { return target_->use_direct_io(); }; + bool use_direct_io() const override { return target_->use_direct_io(); } private: std::unique_ptr target_; @@ -142,7 +143,7 @@ class TestFSRandomAccessFile : public FSRandomAccessFile { Slice* result, char* scratch, IODebugContext* dbg) const override; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, - std::function cb, + std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, @@ -181,13 +182,11 @@ class TestFSDirectory : public FSDirectory { : fs_(fs), dirname_(dirname), dir_(dir) {} ~TestFSDirectory() {} - virtual IOStatus Fsync(const IOOptions& options, - IODebugContext* dbg) override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; - virtual IOStatus Close(const IOOptions& options, - IODebugContext* dbg) override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; - virtual IOStatus FsyncWithDirOptions( + IOStatus FsyncWithDirOptions( const IOOptions& options, IODebugContext* dbg, const DirFsyncOptions& dir_fsync_options) override; @@ -244,22 +243,19 @@ class FaultInjectionTestFS : public FileSystemWrapper { std::unique_ptr* r, IODebugContext* dbg) override; - virtual IOStatus DeleteFile(const std::string& f, const IOOptions& options, - IODebugContext* dbg) override; + IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override; - virtual IOStatus RenameFile(const std::string& s, const std::string& t, - const IOOptions& options, - IODebugContext* dbg) override; + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override; - virtual IOStatus LinkFile(const std::string& src, const std::string& target, - const IOOptions& options, - IODebugContext* dbg) override; + IOStatus LinkFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; // Undef to eliminate clash on Windows #undef GetFreeSpace - virtual IOStatus GetFreeSpace(const std::string& path, - const IOOptions& options, uint64_t* disk_free, - IODebugContext* dbg) override { + IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, + uint64_t* disk_free, IODebugContext* dbg) override { IOStatus io_s; if (!IsFilesystemActive() && error_.subcode() == IOStatus::SubCode::kNoSpace) { @@ -270,10 +266,10 @@ class FaultInjectionTestFS : public FileSystemWrapper { return io_s; } - virtual IOStatus Poll(std::vector& io_handles, - size_t min_completions) override; + IOStatus Poll(std::vector& io_handles, + size_t min_completions) override; - virtual IOStatus AbortIO(std::vector& io_handles) override; + IOStatus AbortIO(std::vector& io_handles) override; void WritableFileClosed(const FSFileState& state); @@ -452,8 +448,8 @@ class FaultInjectionTestFS : public FileSystemWrapper { void SetRandomReadError(int one_in) { read_error_one_in_ = one_in; } bool ShouldInjectRandomReadError() { - return read_error_one_in() && - Random::GetTLSInstance()->OneIn(read_error_one_in()); + auto one_in = read_error_one_in(); + return one_in > 0 && Random::GetTLSInstance()->OneIn(one_in); } // Inject an write error with randomlized parameter and the predefined diff --git a/utilities/fault_injection_secondary_cache.cc b/utilities/fault_injection_secondary_cache.cc index c2ea12535bc..fa93e8244d8 100644 --- a/utilities/fault_injection_secondary_cache.cc +++ b/utilities/fault_injection_secondary_cache.cc @@ -92,6 +92,7 @@ FaultInjectionSecondaryCache::Lookup(const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, + Statistics* stats, bool& kept_in_sec_cache) { ErrorContext* ctx = GetErrorContext(); if (base_is_compressed_sec_cache_) { @@ -99,11 +100,12 @@ FaultInjectionSecondaryCache::Lookup(const Slice& key, return nullptr; } else { return base_->Lookup(key, helper, create_context, wait, advise_erase, - kept_in_sec_cache); + stats, kept_in_sec_cache); } } else { - std::unique_ptr hdl = base_->Lookup( - key, helper, create_context, wait, advise_erase, kept_in_sec_cache); + std::unique_ptr hdl = + base_->Lookup(key, helper, create_context, wait, advise_erase, stats, + kept_in_sec_cache); if (wait && ctx->rand.OneIn(prob_)) { hdl.reset(); } diff --git a/utilities/fault_injection_secondary_cache.h b/utilities/fault_injection_secondary_cache.h index dd73ac15630..226470c73fd 100644 --- a/utilities/fault_injection_secondary_cache.h +++ b/utilities/fault_injection_secondary_cache.h @@ -27,7 +27,7 @@ class FaultInjectionSecondaryCache : public SecondaryCache { } } - virtual ~FaultInjectionSecondaryCache() override {} + ~FaultInjectionSecondaryCache() override {} const char* Name() const override { return "FaultInjectionSecondaryCache"; } @@ -43,7 +43,7 @@ class FaultInjectionSecondaryCache : public SecondaryCache { std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& kept_in_sec_cache) override; + Statistics* stats, bool& kept_in_sec_cache) override; bool SupportForceErase() const override { return base_->SupportForceErase(); } diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 8255a6cad79..3a64fc3fa95 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -65,7 +65,7 @@ class MemoryTest : public testing::Test { if (db_impl != nullptr) { ASSERT_OK(db_impl->TEST_GetAllImmutableCFOptions(&iopts_map)); } - for (auto pair : iopts_map) { + for (const auto& pair : iopts_map) { GetCachePointersFromTableFactory(pair.second->table_factory.get(), cache_set); } diff --git a/utilities/merge_operators/bytesxor.h b/utilities/merge_operators/bytesxor.h index 3c7baaccec4..fb7600d6d16 100644 --- a/utilities/merge_operators/bytesxor.h +++ b/utilities/merge_operators/bytesxor.h @@ -23,9 +23,8 @@ class BytesXOROperator : public AssociativeMergeOperator { public: // XORs the two array of bytes one byte at a time and stores the result // in new_value. len is the number of xored bytes, and the length of new_value - virtual bool Merge(const Slice& key, const Slice* existing_value, - const Slice& value, std::string* new_value, - Logger* logger) const override; + bool Merge(const Slice& key, const Slice* existing_value, const Slice& value, + std::string* new_value, Logger* logger) const override; static const char* kClassName() { return "BytesXOR"; } static const char* kNickName() { return "bytesxor"; } diff --git a/utilities/merge_operators/sortlist.cc b/utilities/merge_operators/sortlist.cc index 67bfc7e5eab..ff0063779fc 100644 --- a/utilities/merge_operators/sortlist.cc +++ b/utilities/merge_operators/sortlist.cc @@ -52,7 +52,9 @@ bool SortList::PartialMergeMulti(const Slice& /*key*/, void SortList::MakeVector(std::vector& operand, Slice slice) const { do { const char* begin = slice.data_; - while (*slice.data_ != ',' && *slice.data_) slice.data_++; + while (*slice.data_ != ',' && *slice.data_) { + slice.data_++; + } operand.push_back(std::stoi(std::string(begin, slice.data_))); } while (0 != *slice.data_++); } diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc index 748e5c89f6c..4ea250c4d7f 100644 --- a/utilities/merge_operators/string_append/stringappend.cc +++ b/utilities/merge_operators/string_append/stringappend.cc @@ -7,8 +7,7 @@ #include "stringappend.h" -#include - +#include #include #include "rocksdb/merge_operator.h" diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h index 4a7b2b9e58b..ec979149cb2 100644 --- a/utilities/merge_operators/string_append/stringappend.h +++ b/utilities/merge_operators/string_append/stringappend.h @@ -17,14 +17,13 @@ class StringAppendOperator : public AssociativeMergeOperator { explicit StringAppendOperator(char delim_char); explicit StringAppendOperator(const std::string& delim); - virtual bool Merge(const Slice& key, const Slice* existing_value, - const Slice& value, std::string* new_value, - Logger* logger) const override; + bool Merge(const Slice& key, const Slice* existing_value, const Slice& value, + std::string* new_value, Logger* logger) const override; static const char* kClassName() { return "StringAppendOperator"; } static const char* kNickName() { return "stringappend"; } - virtual const char* Name() const override { return kClassName(); } - virtual const char* NickName() const override { return kNickName(); } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } private: std::string delim_; // The delimiter is inserted between elements diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc index bd0716cc3ca..31972402fe7 100644 --- a/utilities/merge_operators/string_append/stringappend2.cc +++ b/utilities/merge_operators/string_append/stringappend2.cc @@ -5,8 +5,7 @@ #include "stringappend2.h" -#include - +#include #include #include diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h index 75389e4ae81..7d20038191d 100644 --- a/utilities/merge_operators/string_append/stringappend2.h +++ b/utilities/merge_operators/string_append/stringappend2.h @@ -26,13 +26,12 @@ class StringAppendTESTOperator : public MergeOperator { explicit StringAppendTESTOperator(char delim_char); explicit StringAppendTESTOperator(const std::string& delim); - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override; + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; - virtual bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override; + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const override; static const char* kClassName() { return "StringAppendTESTOperator"; } static const char* kNickName() { return "stringappendtest"; } diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc index 786f2ee2e43..105d52bf5af 100644 --- a/utilities/object_registry.cc +++ b/utilities/object_registry.cc @@ -5,7 +5,7 @@ #include "rocksdb/utilities/object_registry.h" -#include +#include #include "logging/logging.h" #include "port/lang.h" diff --git a/utilities/option_change_migration/option_change_migration_test.cc b/utilities/option_change_migration/option_change_migration_test.cc index 1cb42a0cacf..d6114f8331a 100644 --- a/utilities/option_change_migration/option_change_migration_test.cc +++ b/utilities/option_change_migration/option_change_migration_test.cc @@ -119,7 +119,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate1) { { std::unique_ptr it(db_->NewIterator(ReadOptions())); it->SeekToFirst(); - for (std::string key : keys) { + for (const std::string& key : keys) { ASSERT_TRUE(it->Valid()); ASSERT_EQ(key, it->key().ToString()); it->Next(); @@ -199,7 +199,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate2) { { std::unique_ptr it(db_->NewIterator(ReadOptions())); it->SeekToFirst(); - for (std::string key : keys) { + for (const std::string& key : keys) { ASSERT_TRUE(it->Valid()); ASSERT_EQ(key, it->key().ToString()); it->Next(); @@ -285,7 +285,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate3) { { std::unique_ptr it(db_->NewIterator(ReadOptions())); it->SeekToFirst(); - for (std::string key : keys) { + for (const std::string& key : keys) { ASSERT_TRUE(it->Valid()); ASSERT_EQ(key, it->key().ToString()); it->Next(); @@ -371,7 +371,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate4) { { std::unique_ptr it(db_->NewIterator(ReadOptions())); it->SeekToFirst(); - for (std::string key : keys) { + for (const std::string& key : keys) { ASSERT_TRUE(it->Valid()); ASSERT_EQ(key, it->key().ToString()); it->Next(); @@ -538,7 +538,7 @@ TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) { { std::unique_ptr it(db_->NewIterator(ReadOptions())); it->SeekToFirst(); - for (std::string key : keys) { + for (const std::string& key : keys) { ASSERT_TRUE(it->Valid()); ASSERT_EQ(key, it->key().ToString()); it->Next(); diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index fd9affb0d91..5c4530e617a 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -57,8 +57,8 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { } const std::string kFileName = "OPTIONS-123456"; - ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName, - env_->GetFileSystem().get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), db_opt, cf_names, cf_opts, + kFileName, env_->GetFileSystem().get())); DBOptions loaded_db_opt; std::vector loaded_cf_descs; @@ -121,12 +121,12 @@ TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) { std::vector cf_names; cf_names.push_back(kDefaultColumnFamilyName); - cf_names.push_back("cf_sample"); - cf_names.push_back("cf_plain_table_sample"); + cf_names.emplace_back("cf_sample"); + cf_names.emplace_back("cf_plain_table_sample"); // Saving DB in file const std::string kFileName = "OPTIONS-LOAD_CACHE_123456"; - ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName, - env_->GetFileSystem().get())); + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), db_opt, cf_names, cf_opts, + kFileName, env_->GetFileSystem().get())); DBOptions loaded_db_opt; std::vector loaded_cf_descs; @@ -151,8 +151,8 @@ TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) { namespace { class DummyTableFactory : public TableFactory { public: - DummyTableFactory() {} - ~DummyTableFactory() override {} + DummyTableFactory() = default; + ~DummyTableFactory() override = default; const char* Name() const override { return "DummyTableFactory"; } @@ -183,8 +183,8 @@ class DummyTableFactory : public TableFactory { class DummyMergeOperator : public MergeOperator { public: - DummyMergeOperator() {} - ~DummyMergeOperator() override {} + DummyMergeOperator() = default; + ~DummyMergeOperator() override = default; bool FullMergeV2(const MergeOperationInput& /*merge_in*/, MergeOperationOutput* /*merge_out*/) const override { @@ -203,8 +203,8 @@ class DummyMergeOperator : public MergeOperator { class DummySliceTransform : public SliceTransform { public: - DummySliceTransform() {} - ~DummySliceTransform() override {} + DummySliceTransform() = default; + ~DummySliceTransform() override = default; // Return the name of this transformation. const char* Name() const override { return "DummySliceTransform"; } @@ -758,8 +758,8 @@ TEST_F(OptionsUtilTest, WalDirInOptins) { options.wal_dir = dbname_; std::string options_file; ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file)); - ASSERT_OK(PersistRocksDBOptions(options, {"default"}, {options}, - dbname_ + "/" + options_file, + ASSERT_OK(PersistRocksDBOptions(WriteOptions(), options, {"default"}, + {options}, dbname_ + "/" + options_file, options.env->GetFileSystem().get())); ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, dbname_); @@ -779,4 +779,3 @@ int main(int argc, char** argv) { #endif // GFLAGS return RUN_ALL_TESTS(); } - diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 3118fc2df68..864cb682ea4 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -78,7 +78,7 @@ bool IsCacheFile(const std::string& file) { // check if the file has .rc suffix // Unfortunately regex support across compilers is not even, so we use simple // string parsing - size_t pos = file.find("."); + size_t pos = file.find('.'); if (pos == std::string::npos) { return false; } @@ -97,7 +97,7 @@ Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) { } // cleanup files with the patter :digi:.rc - for (auto file : files) { + for (const auto& file : files) { if (IsCacheFile(file)) { // cache file Info(opt_.log, "Removing file %s.", file.c_str()); diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index ff01c1abcf6..493b9223675 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -79,7 +79,7 @@ struct CacheRecordHeader { }; struct CacheRecord { - CacheRecord() {} + CacheRecord() = default; CacheRecord(const Slice& key, const Slice& val) : hdr_(MAGIC, static_cast(key.size()), static_cast(val.size())), diff --git a/utilities/persistent_cache/hash_table_bench.cc b/utilities/persistent_cache/hash_table_bench.cc index bf4406bb355..1951848f0ec 100644 --- a/utilities/persistent_cache/hash_table_bench.cc +++ b/utilities/persistent_cache/hash_table_bench.cc @@ -163,15 +163,15 @@ class HashTableBenchmark { // Wrapper functions for thread entry // static void WriteMain(void* args) { - reinterpret_cast(args)->RunWrite(); + static_cast(args)->RunWrite(); } static void ReadMain(void* args) { - reinterpret_cast(args)->RunRead(); + static_cast(args)->RunRead(); } static void EraseMain(void* args) { - reinterpret_cast(args)->RunErase(); + static_cast(args)->RunErase(); } HashTableImpl* impl_; // Implementation to test diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index faae2cf2142..7ae6a4a643d 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -5,8 +5,7 @@ // #include "utilities/persistent_cache/hash_table.h" -#include - +#include #include #include #include @@ -17,14 +16,13 @@ #include "util/random.h" #include "utilities/persistent_cache/hash_table_evictable.h" - namespace ROCKSDB_NAMESPACE { struct HashTableTest : public testing::Test { ~HashTableTest() override { map_.Clear(&HashTableTest::ClearNode); } struct Node { - Node() {} + Node() = default; explicit Node(const uint64_t key, const std::string& val = std::string()) : key_(key), val_(val) {} @@ -55,7 +53,7 @@ struct EvictableHashTableTest : public testing::Test { } struct Node : LRUElement { - Node() {} + Node() = default; explicit Node(const uint64_t key, const std::string& val = std::string()) : key_(key), val_(val) {} diff --git a/utilities/persistent_cache/persistent_cache_test.cc b/utilities/persistent_cache/persistent_cache_test.cc index dfbc9b93160..e6ac9caf100 100644 --- a/utilities/persistent_cache/persistent_cache_test.cc +++ b/utilities/persistent_cache/persistent_cache_test.cc @@ -255,7 +255,7 @@ std::shared_ptr MakeTieredCache( #ifdef OS_LINUX static void UniqueIdCallback(void* arg) { - int* result = reinterpret_cast(arg); + int* result = static_cast(arg); if (*result == -1) { *result = 0; } diff --git a/utilities/persistent_cache/persistent_cache_tier.cc b/utilities/persistent_cache/persistent_cache_tier.cc index 773aafbf260..cfa3722b48d 100644 --- a/utilities/persistent_cache/persistent_cache_tier.cc +++ b/utilities/persistent_cache/persistent_cache_tier.cc @@ -82,9 +82,9 @@ bool PersistentCacheTier::Erase(const Slice& /*key*/) { std::string PersistentCacheTier::PrintStats() { std::ostringstream os; - for (auto tier_stats : Stats()) { + for (const auto& tier_stats : Stats()) { os << "---- next tier -----" << std::endl; - for (auto stat : tier_stats) { + for (const auto& stat : tier_stats) { os << stat.first << ": " << stat.second << std::endl; } } diff --git a/utilities/persistent_cache/persistent_cache_tier.h b/utilities/persistent_cache/persistent_cache_tier.h index 44d2fbba31a..c42b7d5a985 100644 --- a/utilities/persistent_cache/persistent_cache_tier.h +++ b/utilities/persistent_cache/persistent_cache_tier.h @@ -253,22 +253,22 @@ class PersistentCacheTier : public PersistentCache { // Print stats to string recursively virtual std::string PrintStats(); - virtual PersistentCache::StatsType Stats() override; + PersistentCache::StatsType Stats() override; // Insert to page cache - virtual Status Insert(const Slice& page_key, const char* data, - const size_t size) override = 0; + Status Insert(const Slice& page_key, const char* data, + const size_t size) override = 0; // Lookup page cache by page identifier - virtual Status Lookup(const Slice& page_key, std::unique_ptr* data, - size_t* size) override = 0; + Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override = 0; // Does it store compressed data ? - virtual bool IsCompressed() override = 0; + bool IsCompressed() override = 0; - virtual std::string GetPrintableOptions() const override = 0; + std::string GetPrintableOptions() const override = 0; - virtual uint64_t NewId() override; + uint64_t NewId() override; // Return a reference to next tier virtual Tier& next_tier() { return next_tier_; } diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index ff9d52dca9b..1daaeb21992 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -73,7 +73,7 @@ class CacheActivityLogger { oss << "LOOKUP - " << key.ToString(true) << std::endl; MutexLock l(&mutex_); - Status s = file_writer_->Append(oss.str()); + Status s = file_writer_->Append(IOOptions(), oss.str()); if (!s.ok() && bg_status_.ok()) { bg_status_ = s; } @@ -93,7 +93,7 @@ class CacheActivityLogger { // line format: "ADD - - " oss << "ADD - " << key.ToString(true) << " - " << size << std::endl; MutexLock l(&mutex_); - Status s = file_writer_->Append(oss.str()); + Status s = file_writer_->Append(IOOptions(), oss.str()); if (!s.ok() && bg_status_.ok()) { bg_status_ = s; } @@ -126,7 +126,7 @@ class CacheActivityLogger { } activity_logging_enabled_.store(false); - Status s = file_writer_->Close(); + Status s = file_writer_->Close(IOOptions()); if (!s.ok() && bg_status_.ok()) { bg_status_ = s; } @@ -157,7 +157,7 @@ class SimCacheImpl : public SimCache { hit_times_(0), stats_(nullptr) {} - ~SimCacheImpl() override {} + ~SimCacheImpl() override = default; const char* Name() const override { return "SimCache"; } diff --git a/utilities/simulator_cache/sim_cache_test.cc b/utilities/simulator_cache/sim_cache_test.cc index 2e37cd34792..e9e3fcd9d94 100644 --- a/utilities/simulator_cache/sim_cache_test.cc +++ b/utilities/simulator_cache/sim_cache_test.cc @@ -175,7 +175,7 @@ TEST_F(SimCacheTest, SimCacheLogging) { sim_cache->StopActivityLogging(); ASSERT_OK(sim_cache->GetActivityLoggingStatus()); - std::string file_contents = ""; + std::string file_contents; ASSERT_OK(ReadFileToString(env_, log_file, &file_contents)); std::istringstream contents(file_contents); diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h index c267463a02b..1ccfa7becdf 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.h +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h @@ -18,29 +18,26 @@ class CompactOnDeletionCollector : public TablePropertiesCollector { // @params key the user key that is inserted into the table. // @params value the value that is inserted into the table. // @params file_size file size up to now - virtual Status AddUserKey(const Slice& key, const Slice& value, - EntryType type, SequenceNumber seq, - uint64_t file_size) override; + Status AddUserKey(const Slice& key, const Slice& value, EntryType type, + SequenceNumber seq, uint64_t file_size) override; // Finish() will be called when a table has already been built and is ready // for writing the properties block. // @params properties User will add their collected statistics to // `properties`. - virtual Status Finish(UserCollectedProperties* /*properties*/) override; + Status Finish(UserCollectedProperties* /*properties*/) override; // Return the human-readable properties, where the key is property name and // the value is the human-readable form of value. - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties(); } // The name of the properties collector can be used for debugging purpose. - virtual const char* Name() const override { - return "CompactOnDeletionCollector"; - } + const char* Name() const override { return "CompactOnDeletionCollector"; } // EXPERIMENTAL Return whether the output file should be further compacted - virtual bool NeedCompact() const override { return need_compaction_; } + bool NeedCompact() const override { return need_compaction_; } static const int kNumBuckets = 128; diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index 5de18e26243..34a47dbf3f3 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -7,10 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include +#include "utilities/table_properties_collectors/compact_on_deletion_collector.h" #include #include +#include #include #include "port/stack_trace.h" @@ -19,7 +20,6 @@ #include "rocksdb/utilities/table_properties_collectors.h" #include "test_util/testharness.h" #include "util/random.h" -#include "utilities/table_properties_collectors/compact_on_deletion_collector.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index f2ca741442b..cbbada57cd7 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ b/utilities/trace/file_trace_reader_writer.cc @@ -96,7 +96,7 @@ Status FileTraceWriter::Close() { } Status FileTraceWriter::Write(const Slice& data) { - return file_writer_->Append(data); + return file_writer_->Append(IOOptions(), data); } uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); } diff --git a/utilities/trace/file_trace_reader_writer.h b/utilities/trace/file_trace_reader_writer.h index 65d4831083b..4535343a627 100644 --- a/utilities/trace/file_trace_reader_writer.h +++ b/utilities/trace/file_trace_reader_writer.h @@ -18,9 +18,9 @@ class FileTraceReader : public TraceReader { explicit FileTraceReader(std::unique_ptr&& reader); ~FileTraceReader(); - virtual Status Read(std::string* data) override; - virtual Status Close() override; - virtual Status Reset() override; + Status Read(std::string* data) override; + Status Close() override; + Status Reset() override; private: std::unique_ptr file_reader_; @@ -37,9 +37,9 @@ class FileTraceWriter : public TraceWriter { explicit FileTraceWriter(std::unique_ptr&& file_writer); ~FileTraceWriter(); - virtual Status Write(const Slice& data) override; - virtual Status Close() override; - virtual uint64_t GetFileSize() override; + Status Write(const Slice& data) override; + Status Close() override; + uint64_t GetFileSize() override; private: std::unique_ptr file_writer_; diff --git a/utilities/trace/replayer_impl.cc b/utilities/trace/replayer_impl.cc index 32a5ad7f051..278ec81b02e 100644 --- a/utilities/trace/replayer_impl.cc +++ b/utilities/trace/replayer_impl.cc @@ -282,8 +282,7 @@ Status ReplayerImpl::ReadTrace(Trace* trace) { } void ReplayerImpl::BackgroundWork(void* arg) { - std::unique_ptr ra( - reinterpret_cast(arg)); + std::unique_ptr ra(static_cast(arg)); assert(ra != nullptr); std::unique_ptr record; diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc index b73b3fe7618..05210977783 100644 --- a/utilities/transactions/lock/point/point_lock_manager.cc +++ b/utilities/transactions/lock/point/point_lock_manager.cc @@ -34,9 +34,8 @@ struct LockInfo { txn_ids.push_back(id); } LockInfo(const LockInfo& lock_info) - : exclusive(lock_info.exclusive), - txn_ids(lock_info.txn_ids), - expiration_time(lock_info.expiration_time) {} + + = default; void operator=(const LockInfo& lock_info) { exclusive = lock_info.exclusive; txn_ids = lock_info.txn_ids; diff --git a/utilities/transactions/lock/point/point_lock_manager_test.h b/utilities/transactions/lock/point/point_lock_manager_test.h index 51d9076b272..4f0054459c9 100644 --- a/utilities/transactions/lock/point/point_lock_manager_test.h +++ b/utilities/transactions/lock/point/point_lock_manager_test.h @@ -69,7 +69,7 @@ class PointLockManagerTest : public testing::Test { PessimisticTransaction* NewTxn( TransactionOptions txn_opt = TransactionOptions()) { Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt); - return reinterpret_cast(txn); + return static_cast(txn); } protected: diff --git a/utilities/transactions/lock/point/point_lock_tracker.h b/utilities/transactions/lock/point/point_lock_tracker.h index 57e1b8437ab..2d5dc1e687e 100644 --- a/utilities/transactions/lock/point/point_lock_tracker.h +++ b/utilities/transactions/lock/point/point_lock_tracker.h @@ -65,7 +65,7 @@ class PointLockTracker : public LockTracker { void Clear() override; - virtual LockTracker* GetTrackedLocksSinceSavePoint( + LockTracker* GetTrackedLocksSinceSavePoint( const LockTracker& save_point_tracker) const override; PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id, diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc index 9c044910dfb..eda9fa2697b 100644 --- a/utilities/transactions/lock/range/range_locking_test.cc +++ b/utilities/transactions/lock/range/range_locking_test.cc @@ -60,7 +60,7 @@ class RangeLockingTest : public ::testing::Test { PessimisticTransaction* NewTxn( TransactionOptions txn_opt = TransactionOptions()) { Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opt); - return reinterpret_cast(txn); + return static_cast(txn); } }; diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc index bec464e48e4..a046f2e3098 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc @@ -260,7 +260,7 @@ void locktree_manager::run_escalation(void) { static void run(void *extra) { locktree_manager *mgr = (locktree_manager *)extra; mgr->escalate_all_locktrees(); - }; + } }; m_escalator.run(this, escalation_fn::run, this); } diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc index eb56b20adfb..0cbd9e19833 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc @@ -158,7 +158,7 @@ void range_buffer::iterator::reset_current_chunk() { bool range_buffer::iterator::current(record *rec) { if (_current_chunk_offset < _current_chunk_max) { - const char *buf = reinterpret_cast(_current_chunk_base); + const char *buf = static_cast(_current_chunk_base); rec->deserialize(buf + _current_chunk_offset); _current_rec_size = rec->size(); return true; @@ -221,7 +221,7 @@ void range_buffer::append_range(const DBT *left_key, const DBT *right_key, bool is_exclusive) { size_t record_length = sizeof(record_header) + left_key->size + right_key->size; - char *buf = reinterpret_cast(_arena.malloc_from_arena(record_length)); + char *buf = static_cast(_arena.malloc_from_arena(record_length)); record_header h; h.init(left_key, right_key, is_exclusive); @@ -244,7 +244,7 @@ void range_buffer::append_range(const DBT *left_key, const DBT *right_key, void range_buffer::append_point(const DBT *key, bool is_exclusive) { size_t record_length = sizeof(record_header) + key->size; - char *buf = reinterpret_cast(_arena.malloc_from_arena(record_length)); + char *buf = static_cast(_arena.malloc_from_arena(record_length)); record_header h; h.init(key, nullptr, is_exclusive); diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc index df0073e0d29..7f0ed3abb89 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc @@ -91,7 +91,7 @@ void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) { // Return false otherwise. bool wfg::node_exists(TXNID txnid) { node *n = find_node(txnid); - return n != NULL; + return n != nullptr; } bool wfg::cycle_exists_from_node(node *target, node *head, diff --git a/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc b/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc index 6dc86cc999a..b1fb736a363 100644 --- a/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc +++ b/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc @@ -7,7 +7,7 @@ /* This is a dump ground to make Lock Tree work without the rest of TokuDB. */ -#include +#include #include "db.h" #include "ft/ft-status.h" @@ -53,7 +53,9 @@ size_t toku_memory_footprint(void *, size_t touched) { return touched; } // "TOKU" LTM_STATUS_S ltm_status; void LTM_STATUS_S::init() { - if (m_initialized) return; + if (m_initialized) { + return; + } #define LTM_STATUS_INIT(k, c, t, l) \ TOKUFT_STATUS_INIT((*this), k, c, t, "locktree: " l, \ TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS) @@ -104,7 +106,9 @@ void LTM_STATUS_S::init() { #undef LTM_STATUS_INIT } void LTM_STATUS_S::destroy() { - if (!m_initialized) return; + if (!m_initialized) { + return; + } for (int i = 0; i < LTM_STATUS_NUM_ROWS; ++i) { if (status[i].type == STATUS_PARCOUNT) { // PORT: TODO?? destroy_partitioned_counter(status[i].value.parcount); diff --git a/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc b/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc index 4ea6249564c..448e9bf51d1 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +++ b/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc @@ -33,6 +33,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ @@ -51,7 +65,7 @@ DBT *toku_init_dbt(DBT *dbt) { } DBT toku_empty_dbt(void) { - static const DBT empty_dbt = {.data = 0, .size = 0, .ulen = 0, .flags = 0}; + static const DBT empty_dbt = {.data = nullptr, .size = 0, .ulen = 0, .flags = 0}; return empty_dbt; } @@ -104,13 +118,13 @@ void toku_sdbt_cleanup(struct simple_dbt *sdbt) { const DBT *toku_dbt_positive_infinity(void) { static DBT positive_infinity_dbt = { - .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port + .data = nullptr, .size = 0, .ulen = 0, .flags = 0}; // port return &positive_infinity_dbt; } const DBT *toku_dbt_negative_infinity(void) { static DBT negative_infinity_dbt = { - .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port + .data = nullptr, .size = 0, .ulen = 0, .flags = 0}; // port return &negative_infinity_dbt; } diff --git a/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h index f20eeedf2fb..53ca5aab162 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h @@ -123,7 +123,7 @@ void partitioned_counters_destroy(void); // Effect: Destroy any partitioned counters data structures. #if defined(__cplusplus) -}; +} #endif #if 0 diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc index 65ca91b0bab..584d9ebc276 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc @@ -149,7 +149,7 @@ Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn, // the lock waits that are in progress. void wait_callback_for_locktree(void*, toku::lock_wait_infos* infos) { TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:EnterWaitingTxn"); - for (auto wait_info : *infos) { + for (const auto& wait_info : *infos) { // As long as we hold the lock on the locktree's pending request queue // this should be safe. auto txn = (PessimisticTransaction*)wait_info.waiter; @@ -305,7 +305,7 @@ std::vector RangeTreeLockManager::GetDeadlockInfoBuffer() { path.push_back( {it2->m_txn_id, it2->m_cf_id, it2->m_exclusive, it2->m_start.slice}); } - res.push_back(DeadlockPath(path, it->deadlock_time)); + res.emplace_back(path, it->deadlock_time); } return res; } @@ -489,7 +489,7 @@ LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() { LockManager::RangeLockStatus data; { InstrumentedMutexLock l(<ree_map_mutex_); - for (auto it : ltree_map_) { + for (const auto& it : ltree_map_) { LOCK_PRINT_CONTEXT ctx = {&data, it.first}; it.second->dump_locks((void*)&ctx, push_into_lock_status_data); } diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc index 5bfb8633767..84573325342 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc @@ -12,7 +12,9 @@ namespace ROCKSDB_NAMESPACE { RangeLockList *RangeTreeLockTracker::getOrCreateList() { - if (range_list_) return range_list_.get(); + if (range_list_) { + return range_list_.get(); + } // Doesn't exist, create range_list_.reset(new RangeLockList()); @@ -103,7 +105,7 @@ void RangeLockList::ReleaseLocks(RangeTreeLockManager *mgr, releasing_locks_.store(true); } - for (auto it : buffers_) { + for (const auto &it : buffers_) { // Don't try to call release_locks() if the buffer is empty! if we are // not holding any locks, the lock tree might be in the STO-mode with // another transaction, and our attempt to release an empty set of locks diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h b/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h index 4ef48d25271..b80c0ed9db5 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h @@ -94,7 +94,7 @@ class RangeTreeLockTracker : public LockTracker { void Clear() override; // "If this method is not supported, returns nullptr." - virtual LockTracker* GetTrackedLocksSinceSavePoint( + LockTracker* GetTrackedLocksSinceSavePoint( const LockTracker&) const override { return nullptr; } diff --git a/utilities/transactions/optimistic_transaction.cc b/utilities/transactions/optimistic_transaction.cc index e8506f28161..bbd99575fb3 100644 --- a/utilities/transactions/optimistic_transaction.cc +++ b/utilities/transactions/optimistic_transaction.cc @@ -19,7 +19,6 @@ #include "util/defer.h" #include "util/string_util.h" #include "utilities/transactions/lock/point/point_lock_tracker.h" -#include "utilities/transactions/optimistic_transaction.h" #include "utilities/transactions/optimistic_transaction_db_impl.h" #include "utilities/transactions/transaction_util.h" @@ -50,7 +49,7 @@ void OptimisticTransaction::Reinitialize( Initialize(txn_options); } -OptimisticTransaction::~OptimisticTransaction() {} +OptimisticTransaction::~OptimisticTransaction() = default; void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); } diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index 30efa86aafb..817cbdd688e 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -43,8 +43,7 @@ Status OptimisticTransactionDB::Open(const Options& options, DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; Status s = Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { @@ -104,7 +103,7 @@ void OptimisticTransactionDBImpl::ReinitializeTransaction( Transaction* txn, const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) { assert(dynamic_cast(txn) != nullptr); - auto txn_impl = reinterpret_cast(txn); + auto txn_impl = static_cast(txn); txn_impl->Reinitialize(this, write_options, txn_options); } diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index 7bc718e9bdc..86213832dde 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -74,15 +74,14 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { // Transactional `DeleteRange()` is not yet supported. using StackableDB::DeleteRange; - virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, - const Slice&, const Slice&) override { + Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, const Slice&, + const Slice&) override { return Status::NotSupported(); } // Range deletions also must not be snuck into `WriteBatch`es as they are // incompatible with `OptimisticTransactionDB`. - virtual Status Write(const WriteOptions& write_opts, - WriteBatch* batch) override { + Status Write(const WriteOptions& write_opts, WriteBatch* batch) override { if (batch->HasDeleteRange()) { return Status::NotSupported(); } diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 73349418045..eb3511be592 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -62,8 +62,7 @@ class OptimisticTransactionTest ColumnFamilyOptions cf_options(options); std::vector column_families; std::vector handles; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); OptimisticTransactionDB* raw_txn_db = nullptr; Status s = OptimisticTransactionDB::Open( options, occ_opts, dbname, column_families, &handles, &raw_txn_db); @@ -654,13 +653,10 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { // open DB with three column families std::vector column_families; // have to open default column family - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); // open the new column families - column_families.push_back( - ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); - column_families.push_back( - ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + column_families.emplace_back("CFA", ColumnFamilyOptions()); + column_families.emplace_back("CFB", ColumnFamilyOptions()); std::vector handles; OptimisticTransactionDB* raw_txn_db = nullptr; ASSERT_OK(OptimisticTransactionDB::Open( @@ -723,7 +719,6 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { ASSERT_TRUE(s.IsBusy()); s = txn_db->Get(read_options, handles[1], "AAAZZZ", &value); ASSERT_TRUE(s.IsNotFound()); - ASSERT_EQ(value, "barbar"); delete txn; delete txn2; @@ -1221,7 +1216,8 @@ TEST_P(OptimisticTransactionTest, IteratorTest) { ASSERT_TRUE(iter->Valid()); ASSERT_EQ(results[i], iter->value().ToString()); - ASSERT_OK(txn->GetForUpdate(read_options, iter->key(), nullptr)); + ASSERT_OK( + txn->GetForUpdate(read_options, iter->key(), (std::string*)nullptr)); iter->Next(); } diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 1e870190e39..4e798de2a45 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -190,11 +190,11 @@ inline Status WriteCommittedTxn::GetForUpdateImpl( } } - if (!do_validate) { + if (!do_validate && kMaxTxnTimestamp != read_timestamp_) { return Status::InvalidArgument( "If do_validate is false then GetForUpdate with read_timestamp is not " "defined."); - } else if (kMaxTxnTimestamp == read_timestamp_) { + } else if (do_validate && kMaxTxnTimestamp == read_timestamp_) { return Status::InvalidArgument("read_timestamp must be set for validation"); } @@ -547,9 +547,8 @@ Status WriteCommittedTxn::PrepareInternal() { : db_(db), two_write_queues_(two_write_queues) { (void)two_write_queues_; // to silence unused private field warning } - virtual Status Callback(SequenceNumber, bool is_mem_disabled, - uint64_t log_number, size_t /*index*/, - size_t /*total*/) override { + Status Callback(SequenceNumber, bool is_mem_disabled, uint64_t log_number, + size_t /*index*/, size_t /*total*/) override { #ifdef NDEBUG (void)is_mem_disabled; #endif @@ -688,7 +687,7 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() { const Comparator* ucmp = WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); return ucmp ? ucmp->timestamp_size() - : std::numeric_limits::max(); + : std::numeric_limits::max(); }); if (!s.ok()) { return s; @@ -763,7 +762,7 @@ Status WriteCommittedTxn::CommitInternal() { const Comparator* ucmp = WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); return ucmp ? ucmp->timestamp_size() - : std::numeric_limits::max(); + : std::numeric_limits::max(); }); } } @@ -885,7 +884,7 @@ Status PessimisticTransaction::LockBatch(WriteBatch* batch, // what the sorting is as long as it's consistent. std::map> keys_; - Handler() {} + Handler() = default; void RecordKey(uint32_t column_family_id, const Slice& key) { auto& cfh_keys = keys_[column_family_id]; diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h index bb12266ec4b..5f8942f4a42 100644 --- a/utilities/transactions/pessimistic_transaction.h +++ b/utilities/transactions/pessimistic_transaction.h @@ -115,13 +115,12 @@ class PessimisticTransaction : public TransactionBaseImpl { int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; } - virtual Status GetRangeLock(ColumnFamilyHandle* column_family, - const Endpoint& start_key, - const Endpoint& end_key) override; + Status GetRangeLock(ColumnFamilyHandle* column_family, + const Endpoint& start_key, + const Endpoint& end_key) override; - virtual Status CollapseKey( - const ReadOptions& options, const Slice& key, - ColumnFamilyHandle* column_family = nullptr) override; + Status CollapseKey(const ReadOptions& options, const Slice& key, + ColumnFamilyHandle* column_family = nullptr) override; protected: // Refer to diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 8009bef197b..57c14b5f7b7 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -134,6 +134,7 @@ Status PessimisticTransactionDB::Initialize( assert(batch_info.log_number_); assert(recovered_trx->name_.length()); + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions w_options; w_options.sync = true; TransactionOptions t_options; @@ -203,8 +204,7 @@ Status TransactionDB::Open(const Options& options, DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; Status s = TransactionDB::Open(db_options, txn_db_options, dbname, column_families, &handles, dbptr); diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h index b662048bd49..a125c6c3562 100644 --- a/utilities/transactions/pessimistic_transaction_db.h +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -36,7 +36,7 @@ class PessimisticTransactionDB : public TransactionDB { virtual ~PessimisticTransactionDB(); - virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } virtual Status Initialize( const std::vector& compaction_enabled_cf_indices, @@ -47,27 +47,24 @@ class PessimisticTransactionDB : public TransactionDB { Transaction* old_txn) override = 0; using StackableDB::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& val) override; + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) override; using StackableDB::Delete; - virtual Status Delete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override; + Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family, + const Slice& key) override; using StackableDB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override; + Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; using StackableDB::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) override; using TransactionDB::Write; - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; inline Status WriteWithConcurrencyControl(const WriteOptions& opts, WriteBatch* updates) { Status s; @@ -96,9 +93,9 @@ class PessimisticTransactionDB : public TransactionDB { } using StackableDB::CreateColumnFamily; - virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, - const std::string& column_family_name, - ColumnFamilyHandle** handle) override; + Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; Status CreateColumnFamilies( const ColumnFamilyOptions& options, @@ -110,7 +107,7 @@ class PessimisticTransactionDB : public TransactionDB { std::vector* handles) override; using StackableDB::DropColumnFamily; - virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + Status DropColumnFamily(ColumnFamilyHandle* column_family) override; Status DropColumnFamilies( const std::vector& column_families) override; @@ -254,10 +251,10 @@ class WriteCommittedTxnDB : public PessimisticTransactionDB { // Optimized version of ::Write that makes use of skip_concurrency_control // hint using TransactionDB::Write; - virtual Status Write(const WriteOptions& opts, - const TransactionDBWriteOptimizations& optimizations, - WriteBatch* updates) override; - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, + WriteBatch* updates) override; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; }; inline Status PessimisticTransactionDB::FailIfBatchHasTs( diff --git a/utilities/transactions/snapshot_checker.cc b/utilities/transactions/snapshot_checker.cc index da363a12fbf..fa94502cb98 100644 --- a/utilities/transactions/snapshot_checker.cc +++ b/utilities/transactions/snapshot_checker.cc @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( WritePreparedTxnDB* txn_db) - : txn_db_(txn_db){}; + : txn_db_(txn_db){} SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( SequenceNumber sequence, SequenceNumber snapshot_sequence) const { diff --git a/utilities/transactions/timestamped_snapshot_test.cc b/utilities/transactions/timestamped_snapshot_test.cc index 9681b0157ad..1ca265aa153 100644 --- a/utilities/transactions/timestamped_snapshot_test.cc +++ b/utilities/transactions/timestamped_snapshot_test.cc @@ -27,7 +27,7 @@ class TsCheckingTxnNotifier : public TransactionNotifier { public: explicit TsCheckingTxnNotifier() = default; - ~TsCheckingTxnNotifier() override {} + ~TsCheckingTxnNotifier() override = default; void SnapshotCreated(const Snapshot* new_snapshot) override { assert(new_snapshot); diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index be363b473a5..7dcf412cb88 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -84,6 +84,13 @@ class TransactionBaseImpl : public Transaction { exclusive, do_validate); } + Status GetForUpdate(const ReadOptions& options, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override { + return GetForUpdate(options, db_->DefaultColumnFamily(), key, pinnable_val, + exclusive, do_validate); + } + using Transaction::MultiGet; std::vector MultiGet( const ReadOptions& _read_options, @@ -201,7 +208,7 @@ class TransactionBaseImpl : public Transaction { WriteBatchWithIndex* GetWriteBatch() override; - virtual void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ + void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ } const Snapshot* GetSnapshot() const override { @@ -213,7 +220,7 @@ class TransactionBaseImpl : public Transaction { return snapshot_; } - virtual void SetSnapshot() override; + void SetSnapshot() override; void SetSnapshotOnNextOperation( std::shared_ptr notifier = nullptr) override; @@ -243,7 +250,7 @@ class TransactionBaseImpl : public Transaction { const Slice& key) override; void UndoGetForUpdate(const Slice& key) override { return UndoGetForUpdate(nullptr, key); - }; + } WriteOptions* GetWriteOptions() override { return &write_options_; } @@ -256,7 +263,7 @@ class TransactionBaseImpl : public Transaction { // iterates over the given batch and makes the appropriate inserts. // used for rebuilding prepared transactions after recovery. - virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + Status RebuildFromWriteBatch(WriteBatch* src_batch) override; WriteBatch* GetCommitTimeWriteBatch() override; @@ -266,9 +273,8 @@ class TransactionBaseImpl : public Transaction { Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) override; - virtual Status GetImpl(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; // Add a key to the list of tracked keys. // diff --git a/utilities/transactions/transaction_db_mutex_impl.cc b/utilities/transactions/transaction_db_mutex_impl.cc index 52c299b376a..9f26b20911c 100644 --- a/utilities/transactions/transaction_db_mutex_impl.cc +++ b/utilities/transactions/transaction_db_mutex_impl.cc @@ -17,8 +17,8 @@ namespace ROCKSDB_NAMESPACE { class TransactionDBMutexImpl : public TransactionDBMutex { public: - TransactionDBMutexImpl() {} - ~TransactionDBMutexImpl() override {} + TransactionDBMutexImpl() = default; + ~TransactionDBMutexImpl() override = default; Status Lock() override; @@ -34,8 +34,8 @@ class TransactionDBMutexImpl : public TransactionDBMutex { class TransactionDBCondVarImpl : public TransactionDBCondVar { public: - TransactionDBCondVarImpl() {} - ~TransactionDBCondVarImpl() override {} + TransactionDBCondVarImpl() = default; + ~TransactionDBCondVarImpl() override = default; Status Wait(std::shared_ptr mutex) override; @@ -91,7 +91,7 @@ Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { Status TransactionDBCondVarImpl::Wait( std::shared_ptr mutex) { - auto mutex_impl = reinterpret_cast(mutex.get()); + auto mutex_impl = static_cast(mutex.get()); std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); cv_.wait(lock); @@ -106,7 +106,7 @@ Status TransactionDBCondVarImpl::WaitFor( std::shared_ptr mutex, int64_t timeout_time) { Status s; - auto mutex_impl = reinterpret_cast(mutex.get()); + auto mutex_impl = static_cast(mutex.get()); std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); if (timeout_time < 0) { diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index d12626ca8c5..a2fa9ce051c 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -208,7 +208,7 @@ TEST_P(TransactionTest, DoubleEmptyWrite) { ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a"))); ASSERT_OK(txn0->Prepare()); delete txn0; - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete()); assert(db != nullptr); txn0 = db->GetTransactionByName("xid2"); @@ -250,6 +250,42 @@ TEST_P(TransactionTest, SuccessTest) { delete txn; } +// Test the basic API of the pinnable slice overload of GetForUpdate() +TEST_P(TransactionTest, SuccessTestPinnable) { + ASSERT_OK(db->ResetStats()); + + WriteOptions write_options; + ReadOptions read_options; + PinnableSlice pinnable_val; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn); + + ASSERT_EQ(0, txn->GetNumPuts()); + ASSERT_LE(0, txn->GetID()); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &pinnable_val)); + ASSERT_EQ(*pinnable_val.GetSelf(), std::string("bar")); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_EQ(1, txn->GetNumPuts()); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &pinnable_val)); + ASSERT_EQ(*pinnable_val.GetSelf(), std::string("bar2")); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK( + db->Get(read_options, db->DefaultColumnFamily(), "foo", &pinnable_val)); + ASSERT_EQ(*pinnable_val.GetSelf(), std::string("bar2")); + + delete txn; +} + TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) { const TxnDBWritePolicy write_policy = std::get<2>(GetParam()); @@ -270,7 +306,7 @@ TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) { SyncPoint::GetInstance()->SetCallBack( "FlushJob::WriteLevel0Table", [&](void* arg) { // db mutex not held. - auto* mems = reinterpret_cast*>(arg); + auto* mems = static_cast*>(arg); assert(mems); ASSERT_EQ(1, mems->size()); auto* ctwb = txn->GetCommitTimeWriteBatch(); @@ -543,13 +579,16 @@ TEST_P(TransactionTest, SharedLocks) { ASSERT_TRUE(txn3); // Test shared access between txns - s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn1->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn3->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn3->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); auto lock_data = db->GetLockStatusData(); @@ -572,23 +611,25 @@ TEST_P(TransactionTest, SharedLocks) { ASSERT_OK(txn3->Rollback()); // Test txn1 and txn2 sharing a lock and txn3 trying to obtain it. - s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn1->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn3->GetForUpdate(read_options, "foo", nullptr); + s = txn3->GetForUpdate(read_options, "foo", (std::string*)nullptr); ASSERT_TRUE(s.IsTimedOut()); ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); txn1->UndoGetForUpdate("foo"); - s = txn3->GetForUpdate(read_options, "foo", nullptr); + s = txn3->GetForUpdate(read_options, "foo", (std::string*)nullptr); ASSERT_TRUE(s.IsTimedOut()); ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); txn2->UndoGetForUpdate("foo"); - s = txn3->GetForUpdate(read_options, "foo", nullptr); + s = txn3->GetForUpdate(read_options, "foo", (std::string*)nullptr); ASSERT_OK(s); ASSERT_OK(txn1->Rollback()); @@ -596,36 +637,42 @@ TEST_P(TransactionTest, SharedLocks) { ASSERT_OK(txn3->Rollback()); // Test txn1 and txn2 sharing a lock and txn2 trying to upgrade lock. - s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn1->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr); ASSERT_TRUE(s.IsTimedOut()); ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); txn1->UndoGetForUpdate("foo"); - s = txn2->GetForUpdate(read_options, "foo", nullptr); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr); ASSERT_OK(s); ASSERT_OK(txn1->Rollback()); ASSERT_OK(txn2->Rollback()); // Test txn1 trying to downgrade its lock. - s = txn1->GetForUpdate(read_options, "foo", nullptr, true /* exclusive */); + s = txn1->GetForUpdate(read_options, "foo", (std::string*)nullptr, + true /* exclusive */); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_TRUE(s.IsTimedOut()); ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); // Should still fail after "downgrading". - s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn1->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_TRUE(s.IsTimedOut()); ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); @@ -634,15 +681,17 @@ TEST_P(TransactionTest, SharedLocks) { // Test txn1 holding an exclusive lock and txn2 trying to obtain shared // access. - s = txn1->GetForUpdate(read_options, "foo", nullptr); + s = txn1->GetForUpdate(read_options, "foo", (std::string*)nullptr); ASSERT_OK(s); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_TRUE(s.IsTimedOut()); ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); txn1->UndoGetForUpdate("foo"); - s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + s = txn2->GetForUpdate(read_options, "foo", (std::string*)nullptr, + false /* exclusive */); ASSERT_OK(s); delete txn1; @@ -676,8 +725,9 @@ TEST_P(TransactionTest, DeadlockCycleShared) { for (uint32_t i = 0; i < 31; i++) { txns[i] = db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txns[i]); - auto s = txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2), - nullptr, false /* exclusive */); + auto s = + txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2), + (std::string*)nullptr, false /* exclusive */); ASSERT_OK(s); } @@ -691,8 +741,9 @@ TEST_P(TransactionTest, DeadlockCycleShared) { std::vector threads; for (uint32_t i = 0; i < 15; i++) { std::function blocking_thread = [&, i] { - auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1), - nullptr, true /* exclusive */); + auto s = + txns[i]->GetForUpdate(read_options, std::to_string(i + 1), + (std::string*)nullptr, true /* exclusive */); ASSERT_OK(s); ASSERT_OK(txns[i]->Rollback()); delete txns[i]; @@ -710,8 +761,8 @@ TEST_P(TransactionTest, DeadlockCycleShared) { // Complete the cycle T[16 - 31] -> T1 for (uint32_t i = 15; i < 31; i++) { - auto s = - txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */); + auto s = txns[i]->GetForUpdate(read_options, "0", (std::string*)nullptr, + true /* exclusive */); ASSERT_TRUE(s.IsDeadlock()); // Calculate next buffer len, plateau at 5 when 5 records are inserted. @@ -810,8 +861,8 @@ TEST_P(TransactionTest, DeadlockCycleShared) { for (uint32_t i = 0; i < 2; i++) { txns_shared[i] = db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txns_shared[i]); - auto s = - txns_shared[i]->GetForUpdate(read_options, std::to_string(i), nullptr); + auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i), + (std::string*)nullptr); ASSERT_OK(s); } @@ -825,7 +876,7 @@ TEST_P(TransactionTest, DeadlockCycleShared) { for (uint32_t i = 0; i < 1; i++) { std::function blocking_thread = [&, i] { auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i + 1), - nullptr); + (std::string*)nullptr); ASSERT_OK(s); ASSERT_OK(txns_shared[i]->Rollback()); delete txns_shared[i]; @@ -842,7 +893,8 @@ TEST_P(TransactionTest, DeadlockCycleShared) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); // Complete the cycle T2 -> T1 with a shared lock. - auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false); + auto s = txns_shared[1]->GetForUpdate(read_options, "0", + (std::string*)nullptr, false); ASSERT_TRUE(s.IsDeadlock()); auto dlock_buffer = db->GetDeadlockInfoBuffer(); @@ -884,7 +936,8 @@ TEST_P(TransactionStressTest, DeadlockCycle) { for (uint32_t i = 0; i < len; i++) { txns[i] = db->BeginTransaction(write_options, txn_options); ASSERT_TRUE(txns[i]); - auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), nullptr); + auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), + (std::string*)nullptr); ASSERT_OK(s); } @@ -899,8 +952,8 @@ TEST_P(TransactionStressTest, DeadlockCycle) { std::vector threads; for (uint32_t i = 0; i + 1 < len; i++) { std::function blocking_thread = [&, i] { - auto s = - txns[i]->GetForUpdate(read_options, std::to_string(i + 1), nullptr); + auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1), + (std::string*)nullptr); ASSERT_OK(s); ASSERT_OK(txns[i]->Rollback()); delete txns[i]; @@ -917,7 +970,8 @@ TEST_P(TransactionStressTest, DeadlockCycle) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); // Complete the cycle Tlen -> T1 - auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr); + auto s = + txns[len - 1]->GetForUpdate(read_options, "0", (std::string*)nullptr); ASSERT_TRUE(s.IsDeadlock()); const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1); @@ -1004,8 +1058,8 @@ TEST_P(TransactionStressTest, DeadlockStress) { // Lock keys in random order. for (const auto& k : random_keys) { // Lock mostly for shared access, but exclusive 1/4 of the time. - auto s = - txn->GetForUpdate(read_options, k, nullptr, txn->GetID() % 4 == 0); + auto s = txn->GetForUpdate(read_options, k, (std::string*)nullptr, + txn->GetID() % 4 == 0); if (!s.ok()) { ASSERT_TRUE(s.IsDeadlock()); ASSERT_OK(txn->Rollback()); @@ -1505,7 +1559,7 @@ TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) { ASSERT_OK(db->FlushWAL(false)); delete txn; // kill and reopen - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); s = ReOpenNoDelete(); ASSERT_OK(s); assert(db != nullptr); @@ -1624,7 +1678,7 @@ TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { - auto* writer = reinterpret_cast(arg); + auto* writer = static_cast(arg); if (writer->ShouldWriteToWAL()) { t_wait_on_prepare.fetch_add(1); @@ -1706,7 +1760,7 @@ TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) { if (i % 29 == 0) { // crash fault_fs->SetFilesystemActive(false); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete()); } else if (i % 37 == 0) { // close @@ -1811,7 +1865,7 @@ TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) { // kill and reopen fault_fs->SetFilesystemActive(false); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete()); // commit old txn @@ -2196,7 +2250,7 @@ TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) { // kill and reopen fault_fs->SetFilesystemActive(false); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete()); assert(db != nullptr); @@ -2787,13 +2841,10 @@ TEST_P(TransactionTest, ColumnFamiliesTest) { // open DB with three column families std::vector column_families; // have to open default column family - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); // open the new column families - column_families.push_back( - ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); - column_families.push_back( - ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + column_families.emplace_back("CFA", ColumnFamilyOptions()); + column_families.emplace_back("CFB", ColumnFamilyOptions()); std::vector handles; @@ -2951,11 +3002,10 @@ TEST_P(TransactionTest, MultiGetBatchedTest) { // open DB with three column families std::vector column_families; // have to open default column family - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); // open the new column families cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); - column_families.push_back(ColumnFamilyDescriptor("CF", cf_options)); + column_families.emplace_back("CF", cf_options); std::vector handles; @@ -3045,11 +3095,10 @@ TEST_P(TransactionTest, MultiGetLargeBatchedTest) { // open DB with three column families std::vector column_families; // have to open default column family - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); // open the new column families cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); - column_families.push_back(ColumnFamilyDescriptor("CF", cf_options)); + column_families.emplace_back("CF", cf_options); std::vector handles; @@ -3877,7 +3926,7 @@ TEST_P(TransactionTest, IteratorTest) { ASSERT_TRUE(iter->Valid()); ASSERT_EQ(results[i], iter->value().ToString()); - s = txn->GetForUpdate(read_options, iter->key(), nullptr); + s = txn->GetForUpdate(read_options, iter->key(), (std::string*)nullptr); if (i == 2) { // "C" was modified after txn's snapshot ASSERT_TRUE(s.IsBusy()); @@ -4800,7 +4849,7 @@ TEST_P(TransactionTest, TimeoutTest) { txn_options0.lock_timeout = 50; // txn timeout no longer infinite Transaction* txn1 = db->BeginTransaction(write_options, txn_options0); - s = txn1->GetForUpdate(read_options, "aaa", nullptr); + s = txn1->GetForUpdate(read_options, "aaa", (std::string*)nullptr); ASSERT_OK(s); // Conflicts with previous GetForUpdate. @@ -4837,7 +4886,7 @@ TEST_P(TransactionTest, TimeoutTest) { txn_options.expiration = 100; // 100ms txn1 = db->BeginTransaction(write_options, txn_options); - s = txn1->GetForUpdate(read_options, "aaa", nullptr); + s = txn1->GetForUpdate(read_options, "aaa", (std::string*)nullptr); ASSERT_OK(s); // Conflicts with previous GetForUpdate. @@ -5457,13 +5506,10 @@ TEST_P(TransactionTest, ToggleAutoCompactionTest) { // open DB with three column families std::vector column_families; // have to open default column family - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); // open the new column families - column_families.push_back( - ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); - column_families.push_back( - ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + column_families.emplace_back("CFA", ColumnFamilyOptions()); + column_families.emplace_back("CFB", ColumnFamilyOptions()); ColumnFamilyOptions* cf_opt_default = &column_families[0].options; ColumnFamilyOptions* cf_opt_cfa = &column_families[1].options; @@ -5860,7 +5906,7 @@ TEST_P(TransactionTest, Optimizations) { // A comparator that uses only the first three bytes class ThreeBytewiseComparator : public Comparator { public: - ThreeBytewiseComparator() {} + ThreeBytewiseComparator() = default; const char* Name() const override { return "test.ThreeBytewiseComparator"; } int Compare(const Slice& a, const Slice& b) const override { Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3); @@ -6198,7 +6244,7 @@ TEST_P(TransactionTest, DuplicateKeys) { delete txn0; // This will check the asserts inside recovery code ASSERT_OK(db->FlushWAL(true)); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete(cfds, &handles)); txn0 = db->GetTransactionByName("xid"); ASSERT_TRUE(txn0 != nullptr); @@ -6222,7 +6268,7 @@ TEST_P(TransactionTest, DuplicateKeys) { // Flush only cf 1 ASSERT_OK(static_cast_with_check(db->GetRootDB()) ->TEST_FlushMemTable(true, false, handles[1])); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete(cfds, &handles)); txn0 = db->GetTransactionByName("xid"); ASSERT_TRUE(txn0 != nullptr); @@ -6260,7 +6306,7 @@ TEST_P(TransactionTest, DuplicateKeys) { // Flush only cf 1 ASSERT_OK(static_cast_with_check(db->GetRootDB()) ->TEST_FlushMemTable(true, false, handles[1])); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete(cfds, &handles)); txn0 = db->GetTransactionByName("xid"); ASSERT_TRUE(txn0 != nullptr); @@ -6293,7 +6339,7 @@ TEST_P(TransactionTest, DuplicateKeys) { // Flush only cf 1 ASSERT_OK(static_cast_with_check(db->GetRootDB()) ->TEST_FlushMemTable(true, false, handles[1])); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete(cfds, &handles)); txn0 = db->GetTransactionByName("xid"); ASSERT_TRUE(txn0 != nullptr); @@ -6320,7 +6366,7 @@ TEST_P(TransactionTest, DuplicateKeys) { // Flush only cf 1 ASSERT_OK(static_cast_with_check(db->GetRootDB()) ->TEST_FlushMemTable(true, false, handles[1])); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete(cfds, &handles)); txn0 = db->GetTransactionByName("xid"); ASSERT_TRUE(txn0 != nullptr); @@ -6347,7 +6393,7 @@ TEST_P(TransactionTest, DuplicateKeys) { // Flush only cf 1 ASSERT_OK(static_cast_with_check(db->GetRootDB()) ->TEST_FlushMemTable(true, false, handles[1])); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete(cfds, &handles)); txn0 = db->GetTransactionByName("xid"); ASSERT_TRUE(txn0 != nullptr); @@ -6463,7 +6509,7 @@ TEST_P(TransactionTest, DoubleCrashInRecovery) { DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); uint64_t wal_file_id = db_impl->TEST_LogfileNumber(); std::string fname = LogFileName(dbname, wal_file_id); - reinterpret_cast(db)->TEST_Crash(); + static_cast(db)->TEST_Crash(); delete txn; delete cf_handle; delete db; @@ -6481,10 +6527,9 @@ TEST_P(TransactionTest, DoubleCrashInRecovery) { // Recover from corruption std::vector handles; std::vector column_families; - column_families.push_back(ColumnFamilyDescriptor(kDefaultColumnFamilyName, - ColumnFamilyOptions())); - column_families.push_back( - ColumnFamilyDescriptor("two", ColumnFamilyOptions())); + column_families.emplace_back(kDefaultColumnFamilyName, + ColumnFamilyOptions()); + column_families.emplace_back("two", ColumnFamilyOptions()); ASSERT_OK(ReOpenNoDelete(column_families, &handles)); assert(db != nullptr); @@ -6628,7 +6673,7 @@ TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) { std::vector cf_names; std::vector cf_handles; - cf_names.push_back("test_cf"); + cf_names.emplace_back("test_cf"); ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles)); ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar")); diff --git a/utilities/transactions/write_committed_transaction_ts_test.cc b/utilities/transactions/write_committed_transaction_ts_test.cc index 595e7ad1ae6..0afd57bcc77 100644 --- a/utilities/transactions/write_committed_transaction_ts_test.cc +++ b/utilities/transactions/write_committed_transaction_ts_test.cc @@ -3,12 +3,12 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" -#include "utilities/merge_operators.h" - #include "test_util/testutil.h" +#include "utilities/merge_operators.h" #include "utilities/transactions/transaction_test.h" namespace ROCKSDB_NAMESPACE { @@ -453,13 +453,17 @@ TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) { std::unique_ptr txn0( NewTxn(WriteOptions(), TransactionOptions())); + // Not set read timestamp, use blind write std::unique_ptr txn1( NewTxn(WriteOptions(), TransactionOptions())); ASSERT_OK(txn1->Put(handles_[1], "key", "value1")); + ASSERT_OK(txn1->Put(handles_[1], "foo", "value1")); ASSERT_OK(txn1->SetCommitTimestamp(24)); ASSERT_OK(txn1->Commit()); txn1.reset(); + // Set read timestamp, use it for validation in GetForUpdate, validation fail + // with conflict: timestamp from db(24) > read timestamp(23) std::string value; ASSERT_OK(txn0->SetReadTimestampForValidation(23)); ASSERT_TRUE( @@ -467,13 +471,71 @@ TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) { ASSERT_OK(txn0->Rollback()); txn0.reset(); + // Set read timestamp, use it for validation in GetForUpdate, validation pass + // with no conflict: timestamp from db(24) < read timestamp (25) std::unique_ptr txn2( NewTxn(WriteOptions(), TransactionOptions())); ASSERT_OK(txn2->SetReadTimestampForValidation(25)); ASSERT_OK(txn2->GetForUpdate(ReadOptions(), handles_[1], "key", &value)); + // Use a different read timestamp in ReadOptions while doing validation is + // invalid. + ReadOptions read_options; + std::string read_timestamp; + Slice diff_read_ts = EncodeU64Ts(24, &read_timestamp); + read_options.timestamp = &diff_read_ts; + ASSERT_TRUE(txn2->GetForUpdate(read_options, handles_[1], "foo", &value) + .IsInvalidArgument()); ASSERT_OK(txn2->SetCommitTimestamp(26)); ASSERT_OK(txn2->Commit()); txn2.reset(); + + // Set read timestamp, call GetForUpdate without validation, invalid + std::unique_ptr txn3( + NewTxn(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn3->SetReadTimestampForValidation(27)); + ASSERT_TRUE(txn3->GetForUpdate(ReadOptions(), handles_[1], "key", &value, + /*exclusive=*/true, /*do_validate=*/false) + .IsInvalidArgument()); + ASSERT_OK(txn3->Rollback()); + txn3.reset(); + + // Not set read timestamp, call GetForUpdate with validation, invalid + std::unique_ptr txn4( + NewTxn(WriteOptions(), TransactionOptions())); + // ReadOptions.timestamp is not set, invalid + ASSERT_TRUE(txn4->GetForUpdate(ReadOptions(), handles_[1], "key", &value) + .IsInvalidArgument()); + // ReadOptions.timestamp is set, also invalid. + // `SetReadTimestampForValidation` must have been called with the same + // timestamp as in ReadOptions.timestamp for validation. + Slice read_ts = EncodeU64Ts(27, &read_timestamp); + read_options.timestamp = &read_ts; + ASSERT_TRUE(txn4->GetForUpdate(read_options, handles_[1], "key", &value) + .IsInvalidArgument()); + ASSERT_OK(txn4->Rollback()); + txn4.reset(); + + // Not set read timestamp, call GetForUpdate without validation, pass + std::unique_ptr txn5( + NewTxn(WriteOptions(), TransactionOptions())); + // ReadOptions.timestamp is not set, pass + ASSERT_OK(txn5->GetForUpdate(ReadOptions(), handles_[1], "key", &value, + /*exclusive=*/true, /*do_validate=*/false)); + // ReadOptions.timestamp explicitly set to max timestamp, pass + Slice max_ts = MaxU64Ts(); + read_options.timestamp = &max_ts; + ASSERT_OK(txn5->GetForUpdate(read_options, handles_[1], "foo", &value, + /*exclusive=*/true, /*do_validate=*/false)); + // NOTE: this commit timestamp is smaller than the db's timestamp (26), but + // this commit can still go through, that breaks the user-defined timestamp + // invariant: newer user-defined timestamp should have newer sequence number. + // So be aware of skipping UDT based validation. Unless users have their own + // ways to ensure the UDT invariant is met, DO NOT skip it. Ways to ensure + // the UDT invariant include: manage a monotonically increasing timestamp, + // commit transactions in a single thread etc. + ASSERT_OK(txn5->SetCommitTimestamp(3)); + ASSERT_OK(txn5->Commit()); + txn5.reset(); } TEST_P(WriteCommittedTxnWithTsTest, BlindWrite) { @@ -628,7 +690,7 @@ TEST_P(WriteCommittedTxnWithTsTest, CheckKeysForConflicts) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::GetLatestSequenceForKey:mem", [&](void* arg) { - auto* const ts_ptr = reinterpret_cast(arg); + auto* const ts_ptr = static_cast(arg); assert(ts_ptr); Slice ts_slc = *ts_ptr; uint64_t last_ts = 0; diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index d6f1ace7373..554499503d7 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1197,7 +1197,9 @@ TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccess) { // create a common_snapshots for each combination. size_t new_comb_cnt = size_t(1) << old_size; for (size_t new_comb = 0; new_comb < new_comb_cnt; new_comb++, loop_id++) { - if (loop_id % split_cnt_ != split_id_) continue; + if (loop_id % split_cnt_ != split_id_) { + continue; + } printf("."); // To signal progress fflush(stdout); std::vector common_snapshots; @@ -1521,7 +1523,7 @@ TEST_P(WritePreparedTransactionTest, TxnInitialize) { txn_options.set_snapshot = true; Transaction* txn1 = db->BeginTransaction(write_options, txn_options); auto snap = txn1->GetSnapshot(); - auto snap_impl = reinterpret_cast(snap); + auto snap_impl = static_cast(snap); // If ::Initialize calls the overriden SetSnapshot, min_uncommitted_ must be // udpated ASSERT_GT(snap_impl->min_uncommitted_, kMinUnCommittedSeq); @@ -1619,7 +1621,7 @@ TEST_P(WritePreparedTransactionTest, SmallestUnCommittedSeq) { } }); ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { - while (1) { + while (true) { MutexLock l(&mutex); if (txns.empty()) { break; @@ -1668,7 +1670,9 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) { ASSERT_OK(ReOpen()); } - if (n % split_cnt_ != split_id_) continue; + if (n % split_cnt_ != split_id_) { + continue; + } if (n % 1000 == 0) { printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n); } @@ -2643,7 +2647,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction2) { int count_value = 0; auto callback = [&](void* arg) { - auto* ikey = reinterpret_cast(arg); + auto* ikey = static_cast(arg); if (ikey->user_key == "key1") { count_value++; if (count_value == 2) { @@ -3082,8 +3086,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { - const auto* const ikey = - reinterpret_cast(arg); + const auto* const ikey = static_cast(arg); assert(ikey); if (ikey->user_key == "b") { assert(ikey->type == kTypeValue); @@ -3156,8 +3159,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing2) { SyncPoint::GetInstance()->SetCallBack( "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { - const auto* const ikey = - reinterpret_cast(arg); + const auto* const ikey = static_cast(arg); assert(ikey); if (ikey->user_key == "b") { assert(ikey->type == kTypeValue); @@ -3222,7 +3224,7 @@ TEST_P(WritePreparedTransactionTest, SingleDeleteAfterRollback) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) { - const auto* const c = reinterpret_cast(arg); + const auto* const c = static_cast(arg); assert(!c); // Trigger once only for SingleDelete during flush. if (0 == count) { @@ -4040,7 +4042,7 @@ TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) { int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); - if (getenv("CIRCLECI")) { + if (getenv("CIRCLECI") || getenv("GITHUB_ACTIONS")) { // Looking for backtrace on "Resource temporarily unavailable" exceptions ::testing::FLAGS_gtest_catch_exceptions = false; } diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 58126a47508..ddaf077ac3f 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -306,6 +306,7 @@ Status WritePreparedTxn::RollbackInternal() { auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap(); auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap(); auto read_at_seq = kMaxSequenceNumber; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions; // to prevent callback's seq to be overrriden inside DBImpk::Get roptions.snapshot = wpt_db_->GetMaxSnapshot(); diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h index 9a0fb81d19c..24e667f78aa 100644 --- a/utilities/transactions/write_prepared_txn.h +++ b/utilities/transactions/write_prepared_txn.h @@ -51,16 +51,15 @@ class WritePreparedTxn : public PessimisticTransaction { // seq in the WAL that is also published, LastPublishedSequence, as opposed to // the last seq in the memtable. using Transaction::Get; - virtual Status Get(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; using Transaction::MultiGet; - virtual void MultiGet(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input = false) override; + void MultiGet(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; // Note: The behavior is undefined in presence of interleaved writes to the // same transaction. @@ -68,11 +67,11 @@ class WritePreparedTxn : public PessimisticTransaction { // based on the last seq in the WAL that is also published, // LastPublishedSequence, as opposed to the last seq in the memtable. using Transaction::GetIterator; - virtual Iterator* GetIterator(const ReadOptions& options) override; - virtual Iterator* GetIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override; + Iterator* GetIterator(const ReadOptions& options) override; + Iterator* GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; - virtual void SetSnapshot() override; + void SetSnapshot() override; protected: void Initialize(const TransactionOptions& txn_options) override; @@ -106,11 +105,10 @@ class WritePreparedTxn : public PessimisticTransaction { Status RollbackInternal() override; - virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, - const Slice& key, - SequenceNumber* tracked_at_seq) override; + Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& key, + SequenceNumber* tracked_at_seq) override; - virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + Status RebuildFromWriteBatch(WriteBatch* src_batch) override; WritePreparedTxnDB* wpt_db_; // Number of sub-batches in prepare diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 91a81d15893..a68e635f634 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -46,7 +46,7 @@ Status WritePreparedTxnDB::Initialize( assert(dbimpl != nullptr); auto rtxns = dbimpl->recovered_transactions(); std::map ordered_seq_cnt; - for (auto rtxn : rtxns) { + for (const auto& rtxn : rtxns) { // There should only one batch for WritePrepared policy. assert(rtxn.second->batches_.size() == 1); const auto& seq = rtxn.second->batches_.begin()->first; @@ -249,13 +249,18 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, Status WritePreparedTxnDB::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value) { + const Slice& key, PinnableSlice* value, + std::string* timestamp) { if (_read_options.io_activity != Env::IOActivity::kUnknown && _read_options.io_activity != Env::IOActivity::kGet) { return Status::InvalidArgument( "Can only call Get with `ReadOptions::io_activity` is " "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); } + if (timestamp) { + return Status::NotSupported( + "Get() that returns timestamp is not implemented"); + } ReadOptions read_options(_read_options); if (read_options.io_activity == Env::IOActivity::kUnknown) { read_options.io_activity = Env::IOActivity::kGet; @@ -325,24 +330,34 @@ void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) { handle_map_.reset(handle_map); } -std::vector WritePreparedTxnDB::MultiGet( - const ReadOptions& _read_options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) { +void WritePreparedTxnDB::MultiGet(const ReadOptions& _read_options, + const size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool /*sorted_input*/) { assert(values); - size_t num_keys = keys.size(); - std::vector stat_list(num_keys); + Status s; if (_read_options.io_activity != Env::IOActivity::kUnknown && _read_options.io_activity != Env::IOActivity::kMultiGet) { - Status s = Status::InvalidArgument( + s = Status::InvalidArgument( "Can only call MultiGet with `ReadOptions::io_activity` is " "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + } + + if (s.ok()) { + if (timestamps) { + s = Status::NotSupported( + "MultiGet() returning timestamps not implemented."); + } + } + if (!s.ok()) { for (size_t i = 0; i < num_keys; ++i) { - stat_list[i] = s; + statuses[i] = s; } - return stat_list; + return; } ReadOptions read_options(_read_options); @@ -350,13 +365,10 @@ std::vector WritePreparedTxnDB::MultiGet( read_options.io_activity = Env::IOActivity::kMultiGet; } - values->resize(num_keys); - for (size_t i = 0; i < num_keys; ++i) { - stat_list[i] = - this->GetImpl(read_options, column_family[i], keys[i], &(*values)[i]); + statuses[i] = + this->GetImpl(read_options, column_families[i], keys[i], &values[i]); } - return stat_list; } // Struct to hold ownership of snapshot and read callback for iterator cleanup. @@ -373,7 +385,7 @@ struct WritePreparedTxnDB::IteratorState { namespace { static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { - delete reinterpret_cast(arg1); + delete static_cast(arg1); } } // anonymous namespace @@ -409,12 +421,12 @@ Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& _read_options, own_snapshot = std::make_shared(db_impl_, snapshot); } assert(snapshot_seq != kMaxSequenceNumber); - auto* cfd = - static_cast_with_check(column_family)->cfd(); + auto* cfh = static_cast_with_check(column_family); + auto* cfd = cfh->cfd(); auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_); - auto* db_iter = db_impl_->NewIteratorImpl(read_options, cfd, super_version, + auto* db_iter = db_impl_->NewIteratorImpl(read_options, cfh, super_version, snapshot_seq, &state->callback, expose_blob_index, allow_refresh); db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); @@ -458,12 +470,12 @@ Status WritePreparedTxnDB::NewIterators( iterators->clear(); iterators->reserve(column_families.size()); for (auto* column_family : column_families) { - auto* cfd = - static_cast_with_check(column_family)->cfd(); + auto* cfh = static_cast_with_check(column_family); + auto* cfd = cfh->cfd(); auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_); - auto* db_iter = db_impl_->NewIteratorImpl(read_options, cfd, super_version, + auto* db_iter = db_impl_->NewIteratorImpl(read_options, cfh, super_version, snapshot_seq, &state->callback, expose_blob_index, allow_refresh); db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); @@ -636,7 +648,7 @@ void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq, const size_t batch_cnt) { TEST_SYNC_POINT_CALLBACK( "RemovePrepared:Start", - const_cast(reinterpret_cast(&prepare_seq))); + const_cast(static_cast(&prepare_seq))); TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:pause"); TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:resume"); ROCKS_LOG_DETAILS(info_log_, @@ -813,6 +825,7 @@ void WritePreparedTxnDB::AdvanceSeqByOne() { // Inserting an empty value will i) let the max evicted entry to be // published, i.e., max == last_published, increase the last published to // be one beyond max, i.e., max < last_published. + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions woptions; TransactionOptions txn_options; Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr); diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 1d33db55054..1d24b1fe063 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -60,9 +60,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { virtual ~WritePreparedTxnDB(); - virtual Status Initialize( - const std::vector& compaction_enabled_cf_indices, - const std::vector& handles) override; + Status Initialize(const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) override; Transaction* BeginTransaction(const WriteOptions& write_options, const TransactionOptions& txn_options, @@ -83,26 +82,24 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { size_t batch_cnt, WritePreparedTxn* txn); using DB::Get; - virtual Status Get(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) override; using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& _read_options, - const std::vector& column_family, - const std::vector& keys, - std::vector* values) override; + void MultiGet(const ReadOptions& _read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) override; using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family) override; + Iterator* NewIterator(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family) override; using DB::NewIterators; - virtual Status NewIterators( - const ReadOptions& _read_options, - const std::vector& column_families, - std::vector* iterators) override; + Status NewIterators(const ReadOptions& _read_options, + const std::vector& column_families, + std::vector* iterators) override; // Check whether the transaction that wrote the value with sequence number seq // is visible to the snapshot with sequence number snapshot_seq. @@ -440,12 +437,11 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const std::vector& handles) override; void UpdateCFComparatorMap(ColumnFamilyHandle* handle) override; - virtual const Snapshot* GetSnapshot() override; + const Snapshot* GetSnapshot() override; SnapshotImpl* GetSnapshotInternal(bool for_ww_conflict_check); protected: - virtual Status VerifyCFOptions( - const ColumnFamilyOptions& cf_options) override; + Status VerifyCFOptions(const ColumnFamilyOptions& cf_options) override; // Assign the min and max sequence numbers for reading from the db. A seq > // max is not valid, and a seq < min is valid, and a min <= seq < max requires // further checking. Normally max is defined by the snapshot and min is by @@ -845,7 +841,7 @@ class WritePreparedTxnReadCallback : public ReadCallback { // Will be called to see if the seq number visible; if not it moves on to // the next seq number. - inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override { + inline bool IsVisibleFullCheck(SequenceNumber seq) override { auto snapshot = max_visible_seq_; bool snap_released = false; auto ret = @@ -882,10 +878,9 @@ class AddPreparedCallback : public PreReleaseCallback { first_prepare_batch_(first_prepare_batch) { (void)two_write_queues_; // to silence unused private field warning } - virtual Status Callback(SequenceNumber prepare_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t log_number, size_t index, - size_t total) override { + Status Callback(SequenceNumber prepare_seq, + bool is_mem_disabled __attribute__((__unused__)), + uint64_t log_number, size_t index, size_t total) override { assert(index < total); // To reduce the cost of lock acquisition competing with the concurrent // prepare requests, lock on the first callback and unlock on the last. @@ -946,10 +941,9 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { assert((aux_batch_cnt_ > 0) != (aux_seq == kMaxSequenceNumber)); // xor } - virtual Status Callback(SequenceNumber commit_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t, size_t /*index*/, - size_t /*total*/) override { + Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { // Always commit from the 2nd queue assert(!db_impl_->immutable_db_options().two_write_queues || is_mem_disabled); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index c30cf9e1f04..cdc888b5d79 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -727,6 +727,7 @@ Status WriteUnpreparedTxn::RollbackInternal() { assert(GetId() > 0); Status s; auto read_at_seq = kMaxSequenceNumber; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions; // to prevent callback's seq to be overrriden inside DBImpk::Get roptions.snapshot = wpt_db_->GetMaxSnapshot(); @@ -882,6 +883,7 @@ Status WriteUnpreparedTxn::RollbackToSavePointInternal() { assert(save_points_ != nullptr && save_points_->size() > 0); const LockTracker& tracked_keys = *save_points_->top().new_locks_; + // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions roptions; roptions.snapshot = top.snapshot_->snapshot(); SequenceNumber min_uncommitted = @@ -1021,8 +1023,8 @@ Status WriteUnpreparedTxn::GetImpl(const ReadOptions& options, namespace { static void CleanupWriteUnpreparedWBWIIterator(void* arg1, void* arg2) { - auto txn = reinterpret_cast(arg1); - auto iter = reinterpret_cast(arg2); + auto txn = static_cast(arg1); + auto iter = static_cast(arg2); txn->RemoveActiveIterator(iter); } } // anonymous namespace diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index fe47c8cd8a3..606ad6e3fcf 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -74,7 +74,7 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); } - virtual bool IsVisibleFullCheck(SequenceNumber seq) override; + bool IsVisibleFullCheck(SequenceNumber seq) override; inline bool valid() { valid_checked_ = true; @@ -117,32 +117,27 @@ class WriteUnpreparedTxn : public WritePreparedTxn { virtual ~WriteUnpreparedTxn(); using TransactionBaseImpl::Put; - virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value, - const bool assume_tracked = false) override; - virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value, - const bool assume_tracked = false) override; + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; using TransactionBaseImpl::Merge; - virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value, - const bool assume_tracked = false) override; + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; using TransactionBaseImpl::Delete; - virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, - const bool assume_tracked = false) override; - virtual Status Delete(ColumnFamilyHandle* column_family, - const SliceParts& key, - const bool assume_tracked = false) override; + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; using TransactionBaseImpl::SingleDelete; - virtual Status SingleDelete(ColumnFamilyHandle* column_family, - const Slice& key, - const bool assume_tracked = false) override; - virtual Status SingleDelete(ColumnFamilyHandle* column_family, - const SliceParts& key, - const bool assume_tracked = false) override; + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; // In WriteUnprepared, untracked writes will break snapshot validation logic. // Snapshot validation will only check the largest sequence number of a key to @@ -153,11 +148,9 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // validate all values larger than snap_seq. Otherwise, we should return // Status::NotSupported for untracked writes. - virtual Status RebuildFromWriteBatch(WriteBatch*) override; + Status RebuildFromWriteBatch(WriteBatch*) override; - virtual uint64_t GetLastLogNumber() const override { - return last_log_number_; - } + uint64_t GetLastLogNumber() const override { return last_log_number_; } void RemoveActiveIterator(Iterator* iter) { active_iterators_.erase( @@ -184,25 +177,23 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // Get and GetIterator needs to be overridden so that a ReadCallback to // handle read-your-own-write is used. using Transaction::Get; - virtual Status Get(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; using Transaction::MultiGet; - virtual void MultiGet(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input = false) override; + void MultiGet(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; using Transaction::GetIterator; - virtual Iterator* GetIterator(const ReadOptions& options) override; - virtual Iterator* GetIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override; + Iterator* GetIterator(const ReadOptions& options) override; + Iterator* GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; - virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, - const Slice& key, - SequenceNumber* tracked_at_seq) override; + Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& key, + SequenceNumber* tracked_at_seq) override; private: friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test; @@ -281,7 +272,7 @@ class WriteUnpreparedTxn : public WritePreparedTxn { SavePoint(const std::map& seqs, ManagedSnapshot* snapshot) - : unprep_seqs_(seqs), snapshot_(snapshot){}; + : unprep_seqs_(seqs), snapshot_(snapshot){} }; // We have 3 data structures holding savepoint information: diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 1d75dd44901..0f52cd2861c 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -37,6 +37,7 @@ Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( // MemTableInserter during recovery to actually do writes into the DB // instead of just dropping the in-memory write batch. // + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions w_options; class InvalidSnapshotReadCallback : public ReadCallback { @@ -249,7 +250,7 @@ Status WriteUnpreparedTxnDB::Initialize( // create 'real' transactions from recovered shell transactions auto rtxns = dbimpl->recovered_transactions(); std::map ordered_seq_cnt; - for (auto rtxn : rtxns) { + for (const auto& rtxn : rtxns) { auto recovered_trx = rtxn.second; assert(recovered_trx); assert(recovered_trx->batches_.size() >= 1); @@ -262,6 +263,7 @@ Status WriteUnpreparedTxnDB::Initialize( continue; } + // TODO: plumb Env::IOActivity, Env::IOPriority WriteOptions w_options; w_options.sync = true; TransactionOptions t_options; @@ -332,7 +334,7 @@ Status WriteUnpreparedTxnDB::Initialize( Status s; // Rollback unprepared transactions. - for (auto rtxn : rtxns) { + for (const auto& rtxn : rtxns) { auto recovered_trx = rtxn.second; if (recovered_trx->unprepared_) { s = RollbackRecoveredTransaction(recovered_trx); @@ -381,7 +383,7 @@ struct WriteUnpreparedTxnDB::IteratorState { namespace { static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) { - delete reinterpret_cast(arg1); + delete static_cast(arg1); } } // anonymous namespace @@ -468,13 +470,13 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& _read_options, min_uncommitted = static_cast_with_check(snapshot)->min_uncommitted_; - auto* cfd = - static_cast_with_check(column_family)->cfd(); + auto* cfh = static_cast_with_check(column_family); + auto* cfd = cfh->cfd(); auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn); SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_); auto* db_iter = db_impl_->NewIteratorImpl( - read_options, cfd, super_version, state->MaxVisibleSeq(), + read_options, cfh, super_version, state->MaxVisibleSeq(), &state->callback, expose_blob_index, allow_refresh); db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr); return db_iter; diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h index 409d73a0a88..82f116e1221 100644 --- a/utilities/transactions/write_unprepared_txn_db.h +++ b/utilities/transactions/write_unprepared_txn_db.h @@ -54,10 +54,9 @@ class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { assert(unprep_seqs.size() > 0); } - virtual Status Callback(SequenceNumber commit_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t, size_t /*index*/, - size_t /*total*/) override { + Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1) ? commit_seq : commit_seq + data_batch_cnt_ - 1; diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index e4bff782658..55354c6cbce 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -109,8 +109,7 @@ bool TtlMergeOperator::PartialMergeMulti(const Slice& key, return false; } - operands_without_ts.push_back( - Slice(operand.data(), operand.size() - ts_len)); + operands_without_ts.emplace_back(operand.data(), operand.size() - ts_len); } // Apply the user partial-merge operator (store result in *new_value) @@ -339,8 +338,7 @@ Status DBWithTTL::Open(const Options& options, const std::string& dbname, DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); std::vector handles; Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles, dbptr, {ttl}, read_only); @@ -493,7 +491,11 @@ Status DBWithTTLImpl::Put(const WriteOptions& options, Status DBWithTTLImpl::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) { + PinnableSlice* value, std::string* timestamp) { + if (timestamp) { + return Status::NotSupported( + "Get() that returns timestamp is not supported"); + } Status st = db_->Get(options, column_family, key, value); if (!st.ok()) { return st; @@ -505,22 +507,34 @@ Status DBWithTTLImpl::Get(const ReadOptions& options, return StripTS(value); } -std::vector DBWithTTLImpl::MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, std::vector* values) { - auto statuses = db_->MultiGet(options, column_family, keys, values); - for (size_t i = 0; i < keys.size(); ++i) { +void DBWithTTLImpl::MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool /*sorted_input*/) { + if (timestamps) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::NotSupported( + "MultiGet() returning timestamps not implemented."); + } + return; + } + + db_->MultiGet(options, num_keys, column_families, keys, values, timestamps, + statuses); + for (size_t i = 0; i < num_keys; ++i) { if (!statuses[i].ok()) { continue; } - statuses[i] = SanityCheckTimestamp((*values)[i]); + PinnableSlice tmp_val = std::move(values[i]); + values[i].PinSelf(tmp_val); + assert(!values[i].IsPinned()); + statuses[i] = SanityCheckTimestamp(values[i]); if (!statuses[i].ok()) { continue; } - statuses[i] = StripTS(&(*values)[i]); + statuses[i] = StripTS(&values[i]); } - return statuses; } bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options, @@ -615,7 +629,9 @@ void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) { opts = GetOptions(h); filter = std::static_pointer_cast( opts.compaction_filter_factory); - if (!filter) return; + if (!filter) { + return; + } filter->SetTtl(ttl); } diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index b125d79b067..731cd3955fe 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -36,7 +36,7 @@ class DBWithTTLImpl : public DBWithTTL { virtual ~DBWithTTLImpl(); - virtual Status Close() override; + Status Close() override; Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options, const std::string& column_family_name, @@ -48,40 +48,36 @@ class DBWithTTLImpl : public DBWithTTL { ColumnFamilyHandle** handle) override; using StackableDB::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& val) override; + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) override; using StackableDB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + std::string* timestamp) override; using StackableDB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, - const std::vector& keys, - std::vector* values) override; + void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) override; using StackableDB::KeyMayExist; - virtual bool KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, - bool* value_found = nullptr) override; + bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) override; using StackableDB::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) override; - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; using StackableDB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& _read_options, - ColumnFamilyHandle* column_family) override; + Iterator* NewIterator(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family) override; - virtual DB* GetBaseDB() override { return db_; } + DB* GetBaseDB() override { return db_; } static bool IsStale(const Slice& value, int32_t ttl, SystemClock* clock); @@ -157,8 +153,8 @@ class TtlCompactionFilter : public LayeredCompactionFilterBase { std::unique_ptr _user_comp_filter_from_factory = nullptr); - virtual bool Filter(int level, const Slice& key, const Slice& old_val, - std::string* new_val, bool* value_changed) const override; + bool Filter(int level, const Slice& key, const Slice& old_val, + std::string* new_val, bool* value_changed) const override; const char* Name() const override { return kClassName(); } static const char* kClassName() { return "TtlCompactionFilter"; } diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index da1d2d0da96..bab175d6f33 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -674,10 +674,10 @@ TEST_F(TtlTest, ColumnFamiliesTest) { delete db; std::vector column_families; - column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions(options))); - column_families.push_back(ColumnFamilyDescriptor( - "ttl_column_family", ColumnFamilyOptions(options))); + column_families.emplace_back(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)); + column_families.emplace_back("ttl_column_family", + ColumnFamilyOptions(options)); std::vector handles; diff --git a/utilities/util_merge_operators_test.cc b/utilities/util_merge_operators_test.cc index fed6f1a75ab..692f1f0071f 100644 --- a/utilities/util_merge_operators_test.cc +++ b/utilities/util_merge_operators_test.cc @@ -11,7 +11,7 @@ namespace ROCKSDB_NAMESPACE { class UtilMergeOperatorTest : public testing::Test { public: - UtilMergeOperatorTest() {} + UtilMergeOperatorTest() = default; std::string FullMergeV2(std::string existing_value, std::vector operands, diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index bbfc60f9b48..0011401900e 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -10,8 +10,10 @@ #include "db/column_family.h" #include "db/db_impl/db_impl.h" +#include "db/dbformat.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "db/wide/wide_columns_helper.h" #include "memory/arena.h" #include "memtable/skiplist.h" #include "options/db_options.h" @@ -201,13 +203,14 @@ Status WriteBatchWithIndex::Rep::ReBuildIndex() { while (s.ok() && !input.empty()) { Slice key, value, blob, xid; uint32_t column_family_id = 0; // default + uint64_t unix_write_time = 0; char tag = 0; // set offset of current entry for call to AddNewEntry() last_entry_offset = input.data() - write_batch.Data().data(); s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, &value, - &blob, &xid); + &blob, &xid, &unix_write_time); if (!s.ok()) { break; } @@ -253,6 +256,20 @@ Status WriteBatchWithIndex::Rep::ReBuildIndex() { case kTypeRollbackXID: case kTypeNoop: break; + case kTypeColumnFamilyWideColumnEntity: + case kTypeWideColumnEntity: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, + kPutEntityRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilyValuePreferredSeqno: + case kTypeValuePreferredSeqno: + // TimedPut is not supported in Transaction APIs. + return Status::Corruption( + "unexpected WriteBatch tag in ReBuildIndex", + std::to_string(static_cast(tag))); default: return Status::Corruption( "unknown WriteBatch tag in ReBuildIndex", @@ -273,7 +290,7 @@ WriteBatchWithIndex::WriteBatchWithIndex( : rep(new Rep(default_index_comparator, reserved_bytes, max_bytes, overwrite_key, protection_bytes_per_key)) {} -WriteBatchWithIndex::~WriteBatchWithIndex() {} +WriteBatchWithIndex::~WriteBatchWithIndex() = default; WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default; @@ -352,6 +369,22 @@ Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, return Status::NotSupported(); } +Status WriteBatchWithIndex::PutEntity(ColumnFamilyHandle* column_family, + const Slice& key, + const WideColumns& columns) { + assert(rep); + + rep->SetLastEntryOffset(); + + const Status s = rep->write_batch.PutEntity(column_family, key, columns); + + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kPutEntityRecord); + } + + return s; +} + Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, const Slice& key) { rep->SetLastEntryOffset(); @@ -434,6 +467,26 @@ Status WriteBatchWithIndex::PutLogData(const Slice& blob) { void WriteBatchWithIndex::Clear() { rep->Clear(); } +namespace { +Status PostprocessStatusBatchOnly(const Status& s, + WBWIIteratorImpl::Result result) { + if (result == WBWIIteratorImpl::kDeleted || + result == WBWIIteratorImpl::kNotFound) { + s.PermitUncheckedError(); + return Status::NotFound(); + } + + if (result == WBWIIteratorImpl::kMergeInProgress) { + s.PermitUncheckedError(); + return Status::MergeInProgress(); + } + + assert(result == WBWIIteratorImpl::kFound || + result == WBWIIteratorImpl::kError); + return s; +} +} // anonymous namespace + Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, const DBOptions& /* options */, const Slice& key, std::string* value) { @@ -442,23 +495,28 @@ Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, auto result = WriteBatchWithIndexInternal::GetFromBatch( this, column_family, key, &merge_context, value, &s); - switch (result) { - case WBWIIteratorImpl::kFound: - case WBWIIteratorImpl::kError: - // use returned status - break; - case WBWIIteratorImpl::kDeleted: - case WBWIIteratorImpl::kNotFound: - s = Status::NotFound(); - break; - case WBWIIteratorImpl::kMergeInProgress: - s = Status::MergeInProgress(); - break; - default: - assert(false); + return PostprocessStatusBatchOnly(s, result); +} + +Status WriteBatchWithIndex::GetEntityFromBatch( + ColumnFamilyHandle* column_family, const Slice& key, + PinnableWideColumns* columns) { + if (!column_family) { + return Status::InvalidArgument( + "Cannot call GetEntityFromBatch without a column family handle"); } - return s; + if (!columns) { + return Status::InvalidArgument( + "Cannot call GetEntityFromBatch without a PinnableWideColumns object"); + } + + MergeContext merge_context; + Status s; + auto result = WriteBatchWithIndexInternal::GetEntityFromBatch( + this, column_family, key, &merge_context, columns, &s); + + return PostprocessStatusBatchOnly(s, result); } Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, @@ -509,6 +567,43 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, nullptr); } +void WriteBatchWithIndex::MergeAcrossBatchAndDB( + ColumnFamilyHandle* column_family, const Slice& key, + const PinnableWideColumns& existing, const MergeContext& merge_context, + PinnableSlice* value, Status* status) { + assert(value); + assert(status); + assert(status->ok() || status->IsNotFound()); + + std::string result_value; + + if (status->ok()) { + if (WideColumnsHelper::HasDefaultColumnOnly(existing.columns())) { + *status = WriteBatchWithIndexInternal::MergeKeyWithBaseValue( + column_family, key, MergeHelper::kPlainBaseValue, + WideColumnsHelper::GetDefaultColumn(existing.columns()), + merge_context, &result_value, + static_cast(nullptr)); + } else { + *status = WriteBatchWithIndexInternal::MergeKeyWithBaseValue( + column_family, key, MergeHelper::kWideBaseValue, existing.columns(), + merge_context, &result_value, + static_cast(nullptr)); + } + } else { + assert(status->IsNotFound()); + *status = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( + column_family, key, merge_context, &result_value, + static_cast(nullptr)); + } + + if (status->ok()) { + value->Reset(); + *value->GetSelf() = std::move(result_value); + value->PinSelf(); + } +} + Status WriteBatchWithIndex::GetFromBatchAndDB( DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, ReadCallback* callback) { @@ -537,45 +632,39 @@ Status WriteBatchWithIndex::GetFromBatchAndDB( if (result == WBWIIteratorImpl::kFound) { pinnable_val->PinSelf(); return s; - } else if (!s.ok() || result == WBWIIteratorImpl::kError) { + } + + if (!s.ok() || result == WBWIIteratorImpl::kError) { return s; - } else if (result == WBWIIteratorImpl::kDeleted) { + } + + if (result == WBWIIteratorImpl::kDeleted) { return Status::NotFound(); } - assert(result == WBWIIteratorImpl::kMergeInProgress || - result == WBWIIteratorImpl::kNotFound); // Did not find key in batch OR could not resolve Merges. Try DB. - if (!callback) { - s = static_cast_with_check(db->GetRootDB()) - ->GetImpl(read_options, column_family, key, pinnable_val); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + + // Note: we have to retrieve all columns if we have to merge KVs from the + // batch and the DB; otherwise, the default column is sufficient. + PinnableWideColumns existing; + + if (result == WBWIIteratorImpl::kMergeInProgress) { + get_impl_options.columns = &existing; } else { - DBImpl::GetImplOptions get_impl_options; - get_impl_options.column_family = column_family; + assert(result == WBWIIteratorImpl::kNotFound); get_impl_options.value = pinnable_val; - get_impl_options.callback = callback; - s = static_cast_with_check(db->GetRootDB()) - ->GetImpl(read_options, key, get_impl_options); } - if (s.ok() || s.IsNotFound()) { // DB Get Succeeded - if (result == WBWIIteratorImpl::kMergeInProgress) { - // Merge result from DB with merges in Batch - std::string merge_result; - - if (s.ok()) { - s = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( - column_family, key, *pinnable_val, merge_context, &merge_result); - } else { - assert(s.IsNotFound()); - s = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( - column_family, key, merge_context, &merge_result); - } - if (s.ok()) { - pinnable_val->Reset(); - *pinnable_val->GetSelf() = std::move(merge_result); - pinnable_val->PinSelf(); - } + get_impl_options.callback = callback; + s = static_cast_with_check(db->GetRootDB()) + ->GetImpl(read_options, key, get_impl_options); + + if (result == WBWIIteratorImpl::kMergeInProgress) { + if (s.ok() || s.IsNotFound()) { // DB lookup succeeded + MergeAcrossBatchAndDB(column_family, key, existing, merge_context, + pinnable_val, &s); } } @@ -612,80 +701,101 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB( return; } - autovector key_context; - autovector sorted_keys; - // To hold merges from the write batch - autovector, - MultiGetContext::MAX_BATCH_SIZE> - merges; + struct MergeTuple { + MergeTuple(const Slice& _key, Status* _s, MergeContext&& _merge_context, + PinnableSlice* _value) + : key(_key), + s(_s), + merge_context(std::move(_merge_context)), + value(_value) { + assert(s); + assert(value); + } + + Slice key; + Status* s; + PinnableWideColumns existing; + MergeContext merge_context; + PinnableSlice* value; + }; + + autovector merges; + + autovector key_contexts; + // Since the lifetime of the WriteBatch is the same as that of the transaction // we cannot pin it as otherwise the returned value will not be available // after the transaction finishes. for (size_t i = 0; i < num_keys; ++i) { + const Slice& key = keys[i]; MergeContext merge_context; std::string batch_value; - Status* s = &statuses[i]; - PinnableSlice* pinnable_val = &values[i]; - pinnable_val->Reset(); + Status* const s = &statuses[i]; auto result = WriteBatchWithIndexInternal::GetFromBatch( - this, column_family, keys[i], &merge_context, &batch_value, s); + this, column_family, key, &merge_context, &batch_value, s); + + PinnableSlice* const pinnable_val = &values[i]; + pinnable_val->Reset(); if (result == WBWIIteratorImpl::kFound) { *pinnable_val->GetSelf() = std::move(batch_value); pinnable_val->PinSelf(); continue; } + if (result == WBWIIteratorImpl::kDeleted) { *s = Status::NotFound(); continue; } + if (result == WBWIIteratorImpl::kError) { continue; } - assert(result == WBWIIteratorImpl::kMergeInProgress || - result == WBWIIteratorImpl::kNotFound); - key_context.emplace_back(column_family, keys[i], &values[i], - /* columns */ nullptr, /* timestamp */ nullptr, - &statuses[i]); - merges.emplace_back(result, std::move(merge_context)); + + // Note: we have to retrieve all columns if we have to merge KVs from the + // batch and the DB; otherwise, the default column is sufficient. + // The columns field will be populated by the loop below to prevent issues + // with dangling pointers. + if (result == WBWIIteratorImpl::kMergeInProgress) { + merges.emplace_back(key, s, std::move(merge_context), pinnable_val); + key_contexts.emplace_back(column_family, key, /* value */ nullptr, + /* columns */ nullptr, /* timestamp */ nullptr, + s); + continue; + } + + assert(result == WBWIIteratorImpl::kNotFound); + key_contexts.emplace_back(column_family, key, pinnable_val, + /* columns */ nullptr, + /* timestamp */ nullptr, s); } - for (KeyContext& key : key_context) { - sorted_keys.emplace_back(&key); + autovector sorted_keys; + sorted_keys.reserve(key_contexts.size()); + + size_t merges_idx = 0; + for (KeyContext& key_context : key_contexts) { + if (!key_context.value) { + assert(*key_context.key == merges[merges_idx].key); + + key_context.columns = &merges[merges_idx].existing; + ++merges_idx; + } + + sorted_keys.emplace_back(&key_context); } // Did not find key in batch OR could not resolve Merges. Try DB. static_cast_with_check(db->GetRootDB()) - ->PrepareMultiGetKeys(key_context.size(), sorted_input, &sorted_keys); + ->PrepareMultiGetKeys(sorted_keys.size(), sorted_input, &sorted_keys); static_cast_with_check(db->GetRootDB()) ->MultiGetWithCallback(read_options, column_family, callback, &sorted_keys); - for (auto iter = key_context.begin(); iter != key_context.end(); ++iter) { - KeyContext& key = *iter; - if (key.s->ok() || key.s->IsNotFound()) { // DB Get Succeeded - size_t index = iter - key_context.begin(); - std::pair& merge_result = - merges[index]; - if (merge_result.first == WBWIIteratorImpl::kMergeInProgress) { - // Merge result from DB with merges in Batch - std::string merged_value; - - if (key.s->ok()) { - *key.s = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( - column_family, *key.key, *key.value, merge_result.second, - &merged_value); - } else { - assert(key.s->IsNotFound()); - *key.s = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( - column_family, *key.key, merge_result.second, &merged_value); - } - if (key.s->ok()) { - key.value->Reset(); - *key.value->GetSelf() = std::move(merged_value); - key.value->PinSelf(); - } - } + for (const auto& merge : merges) { + if (merge.s->ok() || merge.s->IsNotFound()) { // DB lookup succeeded + MergeAcrossBatchAndDB(column_family, merge.key, merge.existing, + merge.merge_context, merge.value, merge.s); } } } diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/utilities/write_batch_with_index/write_batch_with_index_internal.cc index bedd5934d5b..84e30b7cc8c 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -8,6 +8,8 @@ #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/merge_helper.h" +#include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -247,45 +249,83 @@ void BaseDeltaIterator::AdvanceBase() { bool BaseDeltaIterator::BaseValid() const { return base_iterator_->Valid(); } bool BaseDeltaIterator::DeltaValid() const { return delta_iterator_->Valid(); } -void BaseDeltaIterator::ResetValue() { value_.clear(); } +void BaseDeltaIterator::ResetValueAndColumns() { + value_.clear(); + columns_.clear(); +} -void BaseDeltaIterator::SetValueFromBase() { +void BaseDeltaIterator::SetValueAndColumnsFromBase() { assert(current_at_base_); assert(BaseValid()); assert(value_.empty()); + assert(columns_.empty()); value_ = base_iterator_->value(); + columns_ = base_iterator_->columns(); } -void BaseDeltaIterator::SetValueFromDelta() { +void BaseDeltaIterator::SetValueAndColumnsFromDelta() { assert(!current_at_base_); assert(DeltaValid()); assert(value_.empty()); + assert(columns_.empty()); WriteEntry delta_entry = delta_iterator_->Entry(); if (merge_context_.GetNumOperands() == 0) { - value_ = delta_entry.value; + if (delta_entry.type == kPutRecord) { + value_ = delta_entry.value; + columns_.emplace_back(kDefaultWideColumnName, value_); + } else if (delta_entry.type == kPutEntityRecord) { + Slice value_copy(delta_entry.value); + + status_ = WideColumnSerialization::Deserialize(value_copy, columns_); + if (!status_.ok()) { + return; + } + + if (WideColumnsHelper::HasDefaultColumn(columns_)) { + value_ = WideColumnsHelper::GetDefaultColumn(columns_); + } + } return; } + ValueType result_type = kTypeValue; + if (delta_entry.type == kDeleteRecord || delta_entry.type == kSingleDeleteRecord) { status_ = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( - column_family_, delta_entry.key, merge_context_, &merge_result_); + column_family_, delta_entry.key, merge_context_, &merge_result_, + /* result_operand */ nullptr, &result_type); } else if (delta_entry.type == kPutRecord) { - status_ = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( - column_family_, delta_entry.key, delta_entry.value, merge_context_, - &merge_result_); + status_ = WriteBatchWithIndexInternal::MergeKeyWithBaseValue( + column_family_, delta_entry.key, MergeHelper::kPlainBaseValue, + delta_entry.value, merge_context_, &merge_result_, + /* result_operand */ nullptr, &result_type); + } else if (delta_entry.type == kPutEntityRecord) { + status_ = WriteBatchWithIndexInternal::MergeKeyWithBaseValue( + column_family_, delta_entry.key, MergeHelper::kWideBaseValue, + delta_entry.value, merge_context_, &merge_result_, + /* result_operand */ nullptr, &result_type); } else if (delta_entry.type == kMergeRecord) { if (equal_keys_) { - status_ = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( - column_family_, delta_entry.key, base_iterator_->value(), - merge_context_, &merge_result_); + if (WideColumnsHelper::HasDefaultColumnOnly(base_iterator_->columns())) { + status_ = WriteBatchWithIndexInternal::MergeKeyWithBaseValue( + column_family_, delta_entry.key, MergeHelper::kPlainBaseValue, + base_iterator_->value(), merge_context_, &merge_result_, + /* result_operand */ nullptr, &result_type); + } else { + status_ = WriteBatchWithIndexInternal::MergeKeyWithBaseValue( + column_family_, delta_entry.key, MergeHelper::kWideBaseValue, + base_iterator_->columns(), merge_context_, &merge_result_, + /* result_operand */ nullptr, &result_type); + } } else { status_ = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( - column_family_, delta_entry.key, merge_context_, &merge_result_); + column_family_, delta_entry.key, merge_context_, &merge_result_, + /* result_operand */ nullptr, &result_type); } } else { status_ = Status::NotSupported("Unsupported entry type for merge"); @@ -295,14 +335,32 @@ void BaseDeltaIterator::SetValueFromDelta() { return; } + if (result_type == kTypeWideColumnEntity) { + Slice entity(merge_result_); + + status_ = WideColumnSerialization::Deserialize(entity, columns_); + if (!status_.ok()) { + return; + } + + if (WideColumnsHelper::HasDefaultColumn(columns_)) { + value_ = WideColumnsHelper::GetDefaultColumn(columns_); + } + + return; + } + + assert(result_type == kTypeValue); + value_ = merge_result_; + columns_.emplace_back(kDefaultWideColumnName, value_); } void BaseDeltaIterator::UpdateCurrent() { // Suppress false positive clang analyzer warnings. #ifndef __clang_analyzer__ status_ = Status::OK(); - ResetValue(); + ResetValueAndColumns(); while (true) { auto delta_result = WBWIIteratorImpl::kNotFound; @@ -334,13 +392,13 @@ void BaseDeltaIterator::UpdateCurrent() { AdvanceDelta(); } else { current_at_base_ = false; - SetValueFromDelta(); + SetValueAndColumnsFromDelta(); return; } } else if (!DeltaValid()) { // Delta has finished. current_at_base_ = true; - SetValueFromBase(); + SetValueAndColumnsFromBase(); return; } else { int compare = @@ -354,7 +412,7 @@ void BaseDeltaIterator::UpdateCurrent() { if (delta_result != WBWIIteratorImpl::kDeleted || merge_context_.GetNumOperands() > 0) { current_at_base_ = false; - SetValueFromDelta(); + SetValueAndColumnsFromDelta(); return; } // Delta is less advanced and is delete. @@ -364,7 +422,7 @@ void BaseDeltaIterator::UpdateCurrent() { } } else { current_at_base_ = true; - SetValueFromBase(); + SetValueAndColumnsFromBase(); return; } } @@ -457,6 +515,8 @@ WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate( break; // ignore case kXIDRecord: break; // ignore + case kPutEntityRecord: + return WBWIIteratorImpl::kFound; default: return WBWIIteratorImpl::kError; } // end switch statement @@ -493,9 +553,10 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, } Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset); char tag; - uint32_t column_family; + uint32_t column_family = 0; // default + uint64_t unix_write_time = 0; Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, key, value, - blob, xid); + blob, xid, &unix_write_time); if (!s.ok()) { return s; } @@ -533,6 +594,16 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, case kTypeRollbackXID: *type = kXIDRecord; break; + case kTypeColumnFamilyWideColumnEntity: + case kTypeWideColumnEntity: { + *type = kPutEntityRecord; + break; + } + case kTypeColumnFamilyValuePreferredSeqno: + case kTypeValuePreferredSeqno: + // TimedPut is not supported in Transaction APIs. + return Status::Corruption("unexpected WriteBatch tag ", + std::to_string(static_cast(tag))); default: return Status::Corruption("unknown WriteBatch tag ", std::to_string(static_cast(tag))); @@ -632,9 +703,9 @@ WriteEntry WBWIIteratorImpl::Entry() const { auto s = write_batch_->GetEntryFromDataOffset( iter_entry->offset, &ret.type, &ret.key, &ret.value, &blob, &xid); assert(s.ok()); - assert(ret.type == kPutRecord || ret.type == kDeleteRecord || - ret.type == kSingleDeleteRecord || ret.type == kDeleteRangeRecord || - ret.type == kMergeRecord); + assert(ret.type == kPutRecord || ret.type == kPutEntityRecord || + ret.type == kDeleteRecord || ret.type == kSingleDeleteRecord || + ret.type == kDeleteRangeRecord || ret.type == kMergeRecord); // Make sure entry.key does not include user-defined timestamp. const Comparator* const ucmp = comparator_->GetComparator(column_family_id_); size_t ts_sz = ucmp->timestamp_size(); @@ -652,98 +723,205 @@ bool WBWIIteratorImpl::MatchesKey(uint32_t cf_id, const Slice& key) { } } -Status WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( - ColumnFamilyHandle* column_family, const Slice& key, - const MergeContext& context, std::string* result) { - // TODO: support wide columns in WBWI +Status WriteBatchWithIndexInternal::CheckAndGetImmutableOptions( + ColumnFamilyHandle* column_family, const ImmutableOptions** ioptions) { + assert(ioptions); + assert(!*ioptions); if (!column_family) { return Status::InvalidArgument("Must provide a column family"); } - const auto& ioptions = GetImmutableOptions(column_family); + const auto& iopts = GetImmutableOptions(column_family); - const auto* merge_operator = ioptions.merge_operator.get(); + const auto* merge_operator = iopts.merge_operator.get(); if (!merge_operator) { return Status::InvalidArgument( "Merge operator must be set for column family"); } - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. - return MergeHelper::TimedFullMerge( - merge_operator, key, MergeHelper::kNoBaseValue, context.GetOperands(), - ioptions.logger, ioptions.stats, ioptions.clock, - /* update_num_ops_stats */ false, result, - /* columns */ nullptr, /* op_failure_scope */ nullptr); -} - -Status WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( - ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, - const MergeContext& context, std::string* result) { - // TODO: support wide columns in WBWI + *ioptions = &iopts; - if (!column_family) { - return Status::InvalidArgument("Must provide a column family"); - } - - const auto& ioptions = GetImmutableOptions(column_family); - - const auto* merge_operator = ioptions.merge_operator.get(); - if (!merge_operator) { - return Status::InvalidArgument( - "Merge operator must be set for column family"); - } - - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. - return MergeHelper::TimedFullMerge( - merge_operator, key, MergeHelper::kPlainBaseValue, value, - context.GetOperands(), ioptions.logger, ioptions.stats, ioptions.clock, - /* update_num_ops_stats */ false, result, - /* columns */ nullptr, /* op_failure_scope */ nullptr); + return Status::OK(); } -WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( +template +WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatchImpl( WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, - const Slice& key, MergeContext* context, std::string* value, Status* s) { - *s = Status::OK(); + const Slice& key, MergeContext* context, + typename Traits::OutputType* output, Status* s) { + assert(batch); + assert(context); + assert(output); + assert(s); std::unique_ptr iter( static_cast_with_check( batch->NewIterator(column_family))); - // Search the iterator for this key, and updates/merges to it. iter->Seek(key); auto result = iter->FindLatestUpdate(key, context); + if (result == WBWIIteratorImpl::kError) { - (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:", - std::to_string(iter->Entry().type)); + Traits::ClearOutput(output); + *s = Status::Corruption("Unexpected entry in WriteBatchWithIndex:", + std::to_string(iter->Entry().type)); return result; - } else if (result == WBWIIteratorImpl::kNotFound) { + } + + if (result == WBWIIteratorImpl::kNotFound) { + Traits::ClearOutput(output); + *s = Status::OK(); return result; - } else if (result == WBWIIteratorImpl::Result::kFound) { // PUT - Slice entry_value = iter->Entry().value; + } + + auto resolve_merge_outputs = [](auto out) { + std::string* output_value = nullptr; + PinnableWideColumns* output_entity = nullptr; + + if constexpr (std::is_same_v) { + output_value = out; + } else { + static_assert( + std::is_same_v, + "unexpected type"); + output_entity = out; + } + + return std::pair(output_value, + output_entity); + }; + + if (result == WBWIIteratorImpl::Result::kFound) { // Put/PutEntity + WriteEntry entry = iter->Entry(); + if (context->GetNumOperands() > 0) { - *s = MergeKeyWithPlainBaseValue(column_family, key, entry_value, *context, - value); - if (!s->ok()) { - result = WBWIIteratorImpl::Result::kError; + auto [output_value, output_entity] = resolve_merge_outputs(output); + + if (entry.type == kPutRecord) { + *s = MergeKeyWithBaseValue(column_family, key, + MergeHelper::kPlainBaseValue, entry.value, + *context, output_value, output_entity); + } else { + assert(entry.type == kPutEntityRecord); + + *s = MergeKeyWithBaseValue(column_family, key, + MergeHelper::kWideBaseValue, entry.value, + *context, output_value, output_entity); } } else { - value->assign(entry_value.data(), entry_value.size()); + if (entry.type == kPutRecord) { + *s = Traits::SetPlainValue(entry.value, output); + } else { + assert(entry.type == kPutEntityRecord); + *s = Traits::SetWideColumnValue(entry.value, output); + } + } + + if (!s->ok()) { + Traits::ClearOutput(output); + result = WBWIIteratorImpl::Result::kError; } - } else if (result == WBWIIteratorImpl::kDeleted) { + + return result; + } + + if (result == WBWIIteratorImpl::kDeleted) { if (context->GetNumOperands() > 0) { - *s = MergeKeyWithNoBaseValue(column_family, key, *context, value); + auto [output_value, output_entity] = resolve_merge_outputs(output); + + *s = MergeKeyWithNoBaseValue(column_family, key, *context, output_value, + output_entity); if (s->ok()) { result = WBWIIteratorImpl::Result::kFound; } else { + Traits::ClearOutput(output); result = WBWIIteratorImpl::Result::kError; } } + + return result; } + + assert(result == WBWIIteratorImpl::Result::kMergeInProgress); + + Traits::ClearOutput(output); + *s = Status::OK(); return result; } +WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( + WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, + const Slice& key, MergeContext* context, std::string* value, Status* s) { + struct Traits { + using OutputType = std::string; + + static void ClearOutput(OutputType* output) { + assert(output); + output->clear(); + } + + static Status SetPlainValue(const Slice& value, OutputType* output) { + assert(output); + output->assign(value.data(), value.size()); + + return Status::OK(); + } + + static Status SetWideColumnValue(const Slice& entity, OutputType* output) { + assert(output); + + Slice entity_copy = entity; + Slice value_of_default; + const Status s = WideColumnSerialization::GetValueOfDefaultColumn( + entity_copy, value_of_default); + if (!s.ok()) { + ClearOutput(output); + return s; + } + + output->assign(value_of_default.data(), value_of_default.size()); + return Status::OK(); + } + }; + + return GetFromBatchImpl(batch, column_family, key, context, value, s); +} + +WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetEntityFromBatch( + WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, + const Slice& key, MergeContext* context, PinnableWideColumns* columns, + Status* s) { + struct Traits { + using OutputType = PinnableWideColumns; + + static void ClearOutput(OutputType* output) { + assert(output); + output->Reset(); + } + + static Status SetPlainValue(const Slice& value, OutputType* output) { + assert(output); + output->SetPlainValue(value); + + return Status::OK(); + } + + static Status SetWideColumnValue(const Slice& entity, OutputType* output) { + assert(output); + + const Status s = output->SetWideColumnValue(entity); + if (!s.ok()) { + ClearOutput(output); + return s; + } + + return Status::OK(); + } + }; + + return GetFromBatchImpl(batch, column_family, key, context, columns, + s); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h index c4135ad3264..163de2014d4 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.h +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h @@ -8,7 +8,9 @@ #include #include +#include "db/dbformat.h" #include "db/merge_context.h" +#include "db/merge_helper.h" #include "memtable/skiplist.h" #include "options/db_options.h" #include "port/port.h" @@ -47,6 +49,7 @@ class BaseDeltaIterator : public Iterator { void Prev() override; Slice key() const override; Slice value() const override { return value_; } + const WideColumns& columns() const override { return columns_; } Slice timestamp() const override; Status status() const override; void Invalidate(Status s); @@ -58,9 +61,9 @@ class BaseDeltaIterator : public Iterator { void AdvanceBase(); bool BaseValid() const; bool DeltaValid() const; - void ResetValue(); - void SetValueFromBase(); - void SetValueFromDelta(); + void ResetValueAndColumns(); + void SetValueAndColumnsFromBase(); + void SetValueAndColumnsFromDelta(); void UpdateCurrent(); bool forward_; @@ -74,6 +77,7 @@ class BaseDeltaIterator : public Iterator { MergeContext merge_context_; std::string merge_result_; Slice value_; + WideColumns columns_; }; // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. @@ -145,7 +149,7 @@ class ReadableWriteBatch : public WriteBatch { default_cf_ts_sz) {} // Retrieve some information from a write entry in the write batch, given // the start offset of the write entry. - Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, + Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* key, Slice* value, Slice* blob, Slice* xid) const; }; @@ -319,12 +323,12 @@ class WBWIIteratorImpl : public WBWIIterator { // Moves the iterator to first entry of the next key. void NextKey(); - // Moves the iterator to the Update (Put or Delete) for the current key - // If there are no Put/Delete, the Iterator will point to the first entry for - // this key - // @return kFound if a Put was found for the key + // Moves the iterator to the Update (Put, PutEntity or Delete) for the current + // key. If there is no Put/PutEntity/Delete, the Iterator will point to the + // first entry for this key. + // @return kFound if a Put/PutEntity was found for the key // @return kDeleted if a delete was found for the key - // @return kMergeInProgress if only merges were fouund for the key + // @return kMergeInProgress if only merges were found for the key // @return kError if an unsupported operation was found for the key // @return kNotFound if no operations were found for this key // @@ -385,15 +389,52 @@ class WriteBatchWithIndexInternal { static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi, uint32_t cf_id); + template static Status MergeKeyWithNoBaseValue(ColumnFamilyHandle* column_family, const Slice& key, const MergeContext& context, - std::string* result); + ResultTs... results) { + const ImmutableOptions* ioptions = nullptr; - static Status MergeKeyWithPlainBaseValue(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value, - const MergeContext& context, - std::string* result); + const Status s = CheckAndGetImmutableOptions(column_family, &ioptions); + if (!s.ok()) { + return s; + } + + assert(ioptions); + + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + return MergeHelper::TimedFullMerge( + ioptions->merge_operator.get(), key, MergeHelper::kNoBaseValue, + context.GetOperands(), ioptions->logger, ioptions->stats, + ioptions->clock, /* update_num_ops_stats */ false, + /* op_failure_scope */ nullptr, results...); + } + + template + static Status MergeKeyWithBaseValue(ColumnFamilyHandle* column_family, + const Slice& key, const BaseTag& base_tag, + const BaseT& value, + const MergeContext& context, + ResultTs... results) { + const ImmutableOptions* ioptions = nullptr; + + const Status s = CheckAndGetImmutableOptions(column_family, &ioptions); + if (!s.ok()) { + return s; + } + + assert(ioptions); + + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + return MergeHelper::TimedFullMerge( + ioptions->merge_operator.get(), key, base_tag, value, + context.GetOperands(), ioptions->logger, ioptions->stats, + ioptions->clock, /* update_num_ops_stats */ false, + /* op_failure_scope */ nullptr, results...); + } // If batch contains a value for key, store it in *value and return kFound. // If batch contains a deletion for key, return Deleted. @@ -407,6 +448,21 @@ class WriteBatchWithIndexInternal { WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, const Slice& key, MergeContext* merge_context, std::string* value, Status* s); + + static WBWIIteratorImpl::Result GetEntityFromBatch( + WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, + const Slice& key, MergeContext* merge_context, + PinnableWideColumns* columns, Status* s); + + private: + static Status CheckAndGetImmutableOptions(ColumnFamilyHandle* column_family, + const ImmutableOptions** ioptions); + + template + static WBWIIteratorImpl::Result GetFromBatchImpl( + WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, + const Slice& key, MergeContext* merge_context, + typename Traits::OutputType* output, Status* s); }; } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 95333d8f470..cb8e3a343c6 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -13,6 +13,7 @@ #include #include "db/column_family.h" +#include "db/wide/wide_columns_helper.h" #include "port/stack_trace.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -81,37 +82,83 @@ using KVMap = std::map; class KVIter : public Iterator { public: explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {} + bool Valid() const override { return iter_ != map_->end(); } - void SeekToFirst() override { iter_ = map_->begin(); } + + void SeekToFirst() override { + iter_ = map_->begin(); + + if (Valid()) { + Update(); + } + } + void SeekToLast() override { if (map_->empty()) { iter_ = map_->end(); } else { iter_ = map_->find(map_->rbegin()->first); } + + if (Valid()) { + Update(); + } } + void Seek(const Slice& k) override { iter_ = map_->lower_bound(k.ToString()); + + if (Valid()) { + Update(); + } } + void SeekForPrev(const Slice& k) override { iter_ = map_->upper_bound(k.ToString()); Prev(); + + if (Valid()) { + Update(); + } + } + + void Next() override { + ++iter_; + + if (Valid()) { + Update(); + } } - void Next() override { ++iter_; } + void Prev() override { if (iter_ == map_->begin()) { iter_ = map_->end(); return; } --iter_; + + if (Valid()) { + Update(); + } } + Slice key() const override { return iter_->first; } - Slice value() const override { return iter_->second; } + Slice value() const override { return value_; } + const WideColumns& columns() const override { return columns_; } Status status() const override { return Status::OK(); } private: + void Update() { + assert(Valid()); + + value_ = iter_->second; + columns_ = WideColumns{{kDefaultWideColumnName, value_}}; + } + const KVMap* const map_; KVMap::const_iterator iter_; + Slice value_; + WideColumns columns_; }; static std::string PrintContents(WriteBatchWithIndex* batch, @@ -221,7 +268,7 @@ void AssertItersEqual(Iterator* iter1, Iterator* iter2) { void AssertIterEqual(WBWIIteratorImpl* wbwii, const std::vector& keys) { wbwii->SeekToFirst(); - for (auto k : keys) { + for (const auto& k : keys) { ASSERT_TRUE(wbwii->Valid()); ASSERT_EQ(wbwii->Entry().key, k); wbwii->NextKey(); @@ -265,6 +312,12 @@ class WBWIBaseTest : public testing::Test { } else if (key[i] == 'p') { result = key + std::to_string(i); EXPECT_OK(batch_->Put(cf, key, result)); + } else if (key[i] == 'e') { + const std::string suffix = std::to_string(i); + result = key + suffix; + const WideColumns columns{{kDefaultWideColumnName, result}, + {key, suffix}}; + EXPECT_OK(batch_->PutEntity(cf, key, columns)); } else if (key[i] == 'm') { std::string value = key + std::to_string(i); EXPECT_OK(batch_->Merge(cf, key, value)); @@ -358,7 +411,7 @@ void TestValueAsSecondaryIndexHelper(std::vector entries, } else { iter->Seek(""); } - for (auto pair : data_map) { + for (const auto& pair : data_map) { for (auto v : pair.second) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -399,7 +452,7 @@ void TestValueAsSecondaryIndexHelper(std::vector entries, } else { iter->Seek(""); } - for (auto pair : index_map) { + for (const auto& pair : index_map) { for (auto v : pair.second) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -484,7 +537,7 @@ void TestValueAsSecondaryIndexHelper(std::vector entries, { ASSERT_EQ(entries.size(), handler.seen[data.GetID()].size()); size_t i = 0; - for (auto e : handler.seen[data.GetID()]) { + for (const auto& e : handler.seen[data.GetID()]) { auto write_entry = entries[i++]; ASSERT_EQ(e.type, write_entry.type); ASSERT_EQ(e.key, write_entry.key); @@ -498,7 +551,7 @@ void TestValueAsSecondaryIndexHelper(std::vector entries, { ASSERT_EQ(entries.size(), handler.seen[index.GetID()].size()); size_t i = 0; - for (auto e : handler.seen[index.GetID()]) { + for (const auto& e : handler.seen[index.GetID()]) { auto write_entry = entries[i++]; ASSERT_EQ(e.key, write_entry.value); if (write_entry.type != kDeleteRecord) { @@ -771,7 +824,7 @@ TEST_P(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { KVMap map; KVMap merged_map; - for (auto key : source_strings) { + for (const auto& key : source_strings) { std::string value = key + key; int type = rnd.Uniform(6); switch (type) { @@ -2319,7 +2372,7 @@ TEST_P(WriteBatchWithIndexTest, GetAfterMergeDelete) { TEST_F(WBWIOverwriteTest, TestBadMergeOperator) { class FailingMergeOperator : public MergeOperator { public: - FailingMergeOperator() {} + FailingMergeOperator() = default; bool FullMergeV2(const MergeOperationInput& /*merge_in*/, MergeOperationOutput* /*merge_out*/) const override { @@ -2496,6 +2549,366 @@ TEST_P(WriteBatchWithIndexTest, IndexNoTs) { } } +TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) { + // Tests for the case when there's no need to consult the underlying DB during + // queries, i.e. when all queries can be answered using the write batch only. + + ASSERT_OK(OpenDB()); + + constexpr size_t num_keys = 6; + + constexpr char delete_key[] = "d"; + constexpr char delete_merge_key[] = "dm"; + constexpr char put_entity_key[] = "e"; + constexpr char put_entity_merge_key[] = "em"; + constexpr char put_key[] = "p"; + constexpr char put_merge_key[] = "pm"; + + AddToBatch(db_->DefaultColumnFamily(), delete_key); + AddToBatch(db_->DefaultColumnFamily(), delete_merge_key); + AddToBatch(db_->DefaultColumnFamily(), put_entity_key); + AddToBatch(db_->DefaultColumnFamily(), put_entity_merge_key); + AddToBatch(db_->DefaultColumnFamily(), put_key); + AddToBatch(db_->DefaultColumnFamily(), put_merge_key); + + std::array keys{{delete_key, delete_merge_key, + put_entity_key, put_entity_merge_key, + put_key, put_merge_key}}; + + std::array expected{ + {{}, + {{kDefaultWideColumnName, "dm1"}}, + {{kDefaultWideColumnName, "e0"}, {"e", "0"}}, + {{kDefaultWideColumnName, "em0,em1"}, {"em", "0"}}, + {{kDefaultWideColumnName, "p0"}}, + {{kDefaultWideColumnName, "pm0,pm1"}}}}; + + // GetFromBatchAndDB + { + PinnableSlice value; + ASSERT_TRUE(batch_->GetFromBatchAndDB(db_, read_opts_, delete_key, &value) + .IsNotFound()); + } + + for (size_t i = 1; i < num_keys; ++i) { + PinnableSlice value; + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, keys[i], &value)); + ASSERT_EQ(value, expected[i].front().value()); + } + + // MultiGetFromBatchAndDB + { + std::array values; + std::array statuses; + constexpr bool sorted_input = false; + + batch_->MultiGetFromBatchAndDB(db_, read_opts_, db_->DefaultColumnFamily(), + num_keys, keys.data(), values.data(), + statuses.data(), sorted_input); + + ASSERT_TRUE(statuses[0].IsNotFound()); + + for (size_t i = 1; i < num_keys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], expected[i].front().value()); + } + } + + // TODO: add tests for GetEntityFromBatchAndDB and + // MultiGetEntityFromBatchAndDB once they are implemented + + // Iterator + std::unique_ptr iter(batch_->NewIteratorWithBase( + db_->DefaultColumnFamily(), db_->NewIterator(read_opts_), &read_opts_)); + + iter->SeekToFirst(); + + for (size_t i = 1; i < num_keys; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_EQ(iter->value(), expected[i].front().value()); + ASSERT_EQ(iter->columns(), expected[i]); + iter->Next(); + } + + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + + for (size_t i = num_keys - 1; i > 0; --i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_EQ(iter->value(), expected[i].front().value()); + ASSERT_EQ(iter->columns(), expected[i]); + iter->Prev(); + } + + ASSERT_FALSE(iter->Valid()); +} + +TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) { + // Tests for the case when queries require consulting both the write batch and + // the underlying DB, either because of merges or because the write batch + // doesn't contain the key. + + ASSERT_OK(OpenDB()); + + constexpr size_t num_keys = 6; + + // Note: for the "merge" keys, we'll have a merge operation in the write batch + // and the base value (Put/PutEntity/Delete) in the DB. For the "no-merge" + // keys, we'll have nothing in the write batch and a standalone + // Put/PutEntity/Delete in the DB. + constexpr char merge_a_key[] = "ma"; + constexpr char merge_b_key[] = "mb"; + constexpr char merge_c_key[] = "mc"; + constexpr char no_merge_a_key[] = "na"; + constexpr char no_merge_b_key[] = "nb"; + constexpr char no_merge_c_key[] = "nc"; + + constexpr char merge_a_value[] = "mao"; + const WideColumns merge_b_columns{{kDefaultWideColumnName, "mbo"}, + {"mb", "o"}}; + constexpr char no_merge_a_value[] = "nao"; + const WideColumns no_merge_b_columns{{kDefaultWideColumnName, "nbo"}, + {"nb", "o"}}; + + ASSERT_OK(db_->Put(write_opts_, db_->DefaultColumnFamily(), merge_a_key, + merge_a_value)); + ASSERT_OK(db_->PutEntity(write_opts_, db_->DefaultColumnFamily(), merge_b_key, + merge_b_columns)); + ASSERT_OK(db_->Delete(write_opts_, db_->DefaultColumnFamily(), merge_c_key)); + ASSERT_OK(db_->Put(write_opts_, db_->DefaultColumnFamily(), no_merge_a_key, + no_merge_a_value)); + ASSERT_OK(db_->PutEntity(write_opts_, db_->DefaultColumnFamily(), + no_merge_b_key, no_merge_b_columns)); + ASSERT_OK( + db_->Delete(write_opts_, db_->DefaultColumnFamily(), no_merge_c_key)); + + AddToBatch(db_->DefaultColumnFamily(), merge_a_key); + AddToBatch(db_->DefaultColumnFamily(), merge_b_key); + AddToBatch(db_->DefaultColumnFamily(), merge_c_key); + + std::array keys{{merge_a_key, merge_b_key, merge_c_key, + no_merge_a_key, no_merge_b_key, + no_merge_c_key}}; + + std::array expected{ + {{{kDefaultWideColumnName, "mao,ma0"}}, + {{kDefaultWideColumnName, "mbo,mb0"}, {"mb", "o"}}, + {{kDefaultWideColumnName, "mc0"}}, + {{kDefaultWideColumnName, "nao"}}, + {{kDefaultWideColumnName, "nbo"}, {"nb", "o"}}, + {}}}; + + // GetFromBatchAndDB + for (size_t i = 0; i < num_keys - 1; ++i) { + PinnableSlice value; + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, keys[i], &value)); + ASSERT_EQ(value, expected[i].front().value()); + } + + { + PinnableSlice value; + ASSERT_TRUE( + batch_->GetFromBatchAndDB(db_, read_opts_, no_merge_c_key, &value) + .IsNotFound()); + } + + // MultiGetFromBatchAndDB + { + std::array values; + std::array statuses; + constexpr bool sorted_input = false; + + batch_->MultiGetFromBatchAndDB(db_, read_opts_, db_->DefaultColumnFamily(), + num_keys, keys.data(), values.data(), + statuses.data(), sorted_input); + + for (size_t i = 0; i < num_keys - 1; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], expected[i].front().value()); + } + + ASSERT_TRUE(statuses[num_keys - 1].IsNotFound()); + } + + // TODO: add tests for GetEntityFromBatchAndDB and + // MultiGetEntityFromBatchAndDB once they are implemented + + // Iterator + std::unique_ptr iter(batch_->NewIteratorWithBase( + db_->DefaultColumnFamily(), db_->NewIterator(read_opts_), &read_opts_)); + + iter->SeekToFirst(); + + for (size_t i = 0; i < num_keys - 1; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_EQ(iter->value(), expected[i].front().value()); + ASSERT_EQ(iter->columns(), expected[i]); + iter->Next(); + } + + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + + for (size_t i = 0; i < num_keys - 1; ++i) { + const size_t idx = num_keys - 2 - i; + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), keys[idx]); + ASSERT_EQ(iter->value(), expected[idx].front().value()); + ASSERT_EQ(iter->columns(), expected[idx]); + iter->Prev(); + } + + ASSERT_FALSE(iter->Valid()); +} + +TEST_P(WriteBatchWithIndexTest, GetEntityFromBatch) { + ASSERT_OK(OpenDB()); + + // No base value, no merges => NotFound + { + constexpr char key[] = "a"; + + PinnableWideColumns result; + ASSERT_TRUE( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result) + .IsNotFound()); + } + + // No base value, with merges => MergeInProgress + { + constexpr char key[] = "b"; + constexpr char merge_op1[] = "bv1"; + constexpr char merge_op2[] = "bv2"; + + ASSERT_OK(batch_->Merge("b", merge_op1)); + ASSERT_OK(batch_->Merge("b", merge_op2)); + + PinnableWideColumns result; + ASSERT_TRUE( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result) + .IsMergeInProgress()); + } + + // Plain value, no merges => Found + { + constexpr char key[] = "c"; + constexpr char value[] = "cv"; + + ASSERT_OK(batch_->Put(key, value)); + + PinnableWideColumns result; + ASSERT_OK( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result)); + + const WideColumns expected{{kDefaultWideColumnName, value}}; + ASSERT_EQ(result.columns(), expected); + } + + // Wide-column value, no merges => Found + { + constexpr char key[] = "d"; + const WideColumns columns{ + {kDefaultWideColumnName, "d0v"}, {"1", "d1v"}, {"2", "d2v"}}; + + ASSERT_OK(batch_->PutEntity(db_->DefaultColumnFamily(), key, columns)); + + PinnableWideColumns result; + ASSERT_OK( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result)); + + ASSERT_EQ(result.columns(), columns); + } + + // Plain value, with merges => Found + { + constexpr char key[] = "e"; + constexpr char base_value[] = "ev0"; + constexpr char merge_op1[] = "ev1"; + constexpr char merge_op2[] = "ev2"; + + ASSERT_OK(batch_->Put(key, base_value)); + ASSERT_OK(batch_->Merge(key, merge_op1)); + ASSERT_OK(batch_->Merge(key, merge_op2)); + + PinnableWideColumns result; + ASSERT_OK( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result)); + + const WideColumns expected{{kDefaultWideColumnName, "ev0,ev1,ev2"}}; + ASSERT_EQ(result.columns(), expected); + } + + // Wide-column value, with merges => Found + { + constexpr char key[] = "f"; + const WideColumns base_columns{ + {kDefaultWideColumnName, "f0v0"}, {"1", "f1v"}, {"2", "f2v"}}; + constexpr char merge_op1[] = "f0v1"; + constexpr char merge_op2[] = "f0v2"; + + ASSERT_OK(batch_->PutEntity(db_->DefaultColumnFamily(), key, base_columns)); + ASSERT_OK(batch_->Merge(key, merge_op1)); + ASSERT_OK(batch_->Merge(key, merge_op2)); + + PinnableWideColumns result; + ASSERT_OK( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result)); + + const WideColumns expected{{kDefaultWideColumnName, "f0v0,f0v1,f0v2"}, + base_columns[1], + base_columns[2]}; + ASSERT_EQ(result.columns(), expected); + } + + // Delete, no merges => NotFound + { + constexpr char key[] = "g"; + + ASSERT_OK(batch_->Delete(key)); + + PinnableWideColumns result; + ASSERT_TRUE( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result) + .IsNotFound()); + } + + // Delete, with merges => Found + { + constexpr char key[] = "h"; + constexpr char merge_op1[] = "hv1"; + constexpr char merge_op2[] = "hv2"; + + ASSERT_OK(batch_->Delete(key)); + ASSERT_OK(batch_->Merge(key, merge_op1)); + ASSERT_OK(batch_->Merge(key, merge_op2)); + + PinnableWideColumns result; + ASSERT_OK( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, &result)); + + const WideColumns expected{{kDefaultWideColumnName, "hv1,hv2"}}; + ASSERT_EQ(result.columns(), expected); + } + + // Validate parameters + { + constexpr char key[] = "foo"; + PinnableWideColumns result; + + ASSERT_TRUE( + batch_->GetEntityFromBatch(nullptr, key, &result).IsInvalidArgument()); + ASSERT_TRUE( + batch_->GetEntityFromBatch(db_->DefaultColumnFamily(), key, nullptr) + .IsInvalidArgument()); + } +} + INSTANTIATE_TEST_CASE_P(WBWI, WriteBatchWithIndexTest, testing::Bool()); } // namespace ROCKSDB_NAMESPACE